Not able to download images from Google - python-3.x

Below is my code to download images from Google. When I run it, only 80 images are downloaded.
import os
import requests
from bs4 import BeautifulSoup

GOOGLE_IMAGE = 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'
# PATH = r'C:\Program Files\edge\msedgedriver.exe'

usr_agent = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.63',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

SAVE_FOLDER = r'C:\Users\sansh\Desktop\Py\scrape\new'

def main():
    if not os.path.exists(SAVE_FOLDER):
        os.mkdir(SAVE_FOLDER)
    download_images()

def download_images():
    data = input('What are u searching for?')
    n_images = int(input('How many images do you want?'))
    print('Start searching ...')
    search_url = GOOGLE_IMAGE + 'q=' + data.replace('_', '+')
    print(search_url)
    response = requests.get(search_url, headers=usr_agent)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all('img', {'class': 'rg_i Q4LuWd'}, limit=n_images)
    image_links = []
    for res in results:
        try:
            link = res['data-src']
            image_links.append(link)
        except KeyError:
            continue
    print(f'Found {len(image_links)} images')
    print("Starting downloader ...")
    for i, image_link in enumerate(image_links):
        down = requests.get(image_link)
        image_name = SAVE_FOLDER + '/' + data + str(i + 1) + '.jpg'
        with open(image_name, 'wb') as file:
            file.write(down.content)
    print("Download completed ...")

if __name__ == '__main__':
    main()
I am not sure what the problem is, and no error is shown. Can anyone help me solve this?
Here is a screenshot of the result after running this code:
[screenshot][1]
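A likely cause: the static HTML Google serves to requests contains only a limited batch of thumbnail <img> tags; the remaining results are injected by JavaScript, so requests plus BeautifulSoup caps out around that batch regardless of n_images, and a browser-automation tool such as Selenium would be needed to scroll more results into the page. Some of the thumbnails that are present also keep their URL in src rather than data-src; a small sketch of that fallback, slotting into download_images():

# Sketch: fall back to src when data-src is absent; skip inline base64 previews
image_links = []
for res in results:
    link = res.get('data-src') or res.get('src')
    if link and link.startswith('http'):
        image_links.append(link)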

Related

requests_html does not render javascript webpages

Why doesn't my code render JavaScript pages using requests_html and BeautifulSoup?
from requests_html import HTMLSession
from bs4 import BeautifulSoup

session = HTMLSession()

def track(num):
    url = f'https://www.trackingmore.com/track/en/{num}'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0'}
    r = session.post(url, headers=headers)
    # r.html.render(timeout=20)
    res = []
    soup = BeautifulSoup(r.content, 'lxml')
    st = soup.find('div', class_="track-status uk-flex")
    print(st.text)
    if st != 'Not Found':
        checkpoint = soup.find_all('div', class_="info-checkpoint")
        for i in checkpoint:
            date = i.find('div', class_='info-date').text.strip()
            desc = i.find('div', class_='info-desc').text.strip()
            res.append({
                'Date': date.replace('\xa0', '')[:19],
                'Description': desc.replace('\xa0', '')
            })
        return res
    else:
        return res
The output looks like this; I can't get the actual values, only the raw JavaScript template placeholders:
[{'Date': '{{info.Date}} {{inf', 'Description': '{{info.StatusDescription}}'}, {'Date': '{{info.Date}} {{inf', 'Description': '{{info.StatusDescription}}'}]
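Those {{info.Date}} placeholders are client-side template markup: the tracking values are filled in by JavaScript after the page loads, so the raw HTML never contains them. The usual fix with requests_html is the line that is commented out: render the page, then parse the rendered DOM instead of the raw response. A minimal sketch, assuming headless Chromium can be downloaded on first run and using a hypothetical tracking number:

from requests_html import HTMLSession
from bs4 import BeautifulSoup

session = HTMLSession()
r = session.post('https://www.trackingmore.com/track/en/UA000000000CN',  # hypothetical number
                 headers={'User-Agent': 'Mozilla/5.0'})
r.html.render(timeout=20)                   # runs the page's JavaScript in headless Chromium
soup = BeautifulSoup(r.html.html, 'lxml')   # parse the rendered DOM, not r.content
print(soup.find('div', class_="track-status uk-flex"))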

WebScraping / Identical sites not working?

I would like to scrape the header element from both of these links.
To me the two sites look absolutely identical (see the pictures below).
Why does scraping work only for the second link and not for the first?
import time
import requests
from bs4 import BeautifulSoup

# not working
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
page = requests.get(link)
time.sleep(1)
soup = BeautifulSoup(page.content, "html.parser")
erg = soup.find("header")
print(f"First Link: {erg}")

# working
link = "https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4"
page = requests.get(link)
time.sleep(1)
soup = BeautifulSoup(page.content, "html.parser")
erg = soup.find("header")
print(f"Second Link: {len(erg)}")
[Screenshots of the working and the non-working output omitted.]
The page is sometimes rendered by JavaScript, so plain requests won't always receive the header. You can use a while loop that re-fetches the page until the header appears in the soup, then break:
import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"

while True:
    # pass the headers; otherwise they are defined but never used
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    header = soup.find("header")
    if header:
        break

print(header)
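If the header never shows up, while True loops forever and hammers the server. A bounded variant; the cap of 10 tries and the 1-second pause are arbitrary choices, not part of the original answer:

import time

header = None
for attempt in range(10):   # give up after 10 tries instead of looping forever
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    header = soup.find("header")
    if header:
        break
    time.sleep(1)           # brief pause between retries
print(header)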
Try this to grab whatever fields you wish from those links. Currently it fetches the title; you can modify res.json()['data'][0]['attributes']['name'] to grab any field of interest. Make sure to put the URLs in urls_to_scrape.
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote

urls_to_scrape = {
    'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4',
    'https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4'
}
base_url = 'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4'
link = 'https://amp-api.apps.apple.com/v1/catalog/US/apps/{}'
params = {
    'platform': 'web',
    'additionalPlatforms': 'appletv,ipad,iphone,mac',
    'extend': 'customPromotionalText,customScreenshotsByType,description,developerInfo,distributionKind,editorialVideo,fileSizeByDevice,messagesScreenshots,privacy,privacyPolicyText,privacyPolicyUrl,requirementsByDeviceFamily,supportURLForLanguage,versionHistory,websiteUrl',
    'include': 'genres,developer,reviews,merchandised-in-apps,customers-also-bought-apps,developer-other-apps,app-bundles,top-in-apps,related-editorial-items',
    'l': 'en-us',
    'limit[merchandised-in-apps]': '20',
    'omit[resource]': 'autos',
    'sparseLimit[apps:related-editorial-items]': '5'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    # the app page embeds a bearer token for the amp-api endpoint in this meta tag
    res = s.get(base_url)
    soup = BeautifulSoup(res.text, "lxml")
    token_raw = soup.select_one("[name='web-experience-app/config/environment']").get("content")
    token = json.loads(unquote(token_raw))['MEDIA_API']['token']
    s.headers['Accept'] = 'application/json'
    s.headers['Referer'] = 'https://apps.apple.com/'
    s.headers['Authorization'] = f'Bearer {token}'
    for url in urls_to_scrape:
        # pull the numeric app id out of the url, e.g. id1179108009 -> 1179108009
        id_ = url.split("/")[-1].strip("id").split("?")[0]
        res = s.get(link.format(id_), params=params)
        title = res.json()['data'][0]['attributes']['name']
        print(title)
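The token scraped from the page is a bearer token that eventually expires, so it can help to fail loudly when a request is rejected. A hedged variant of the loop body; attrs is simply the same attributes dict the answer indexes into:

for url in urls_to_scrape:
    id_ = url.split("/")[-1].strip("id").split("?")[0]
    res = s.get(link.format(id_), params=params)
    res.raise_for_status()                       # surfaces a 401 once the token expires
    attrs = res.json()['data'][0]['attributes']  # same dict the answer indexes into
    print(sorted(attrs))                         # inspect the available field names first
    print(attrs['name'])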

Scraping a Video from a website and downloading it as mp4

I am trying to scrape this website and download the soccer goal video in it. I saw this Stack Overflow post and tried the solution, but it still does not work.
Here is the code I have:
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4p'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

with open('video.mp4', 'wb') as f_out:
    r = requests.get(url, headers=headers, stream=True)
    print(r)
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)
When I print the response it shows 403, and because of that the video does not open or download. Any thoughts on how I can download the video?
Use the signed URL below instead:
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4?Expires=1621994280&Signature=IqySuJxyVi9pCmC~JUhl-iyp-LmG6OiAFfQeu-~-a55osCfu9VrksEhzaQzJlMxAHcSt1R4j9Pt-G8sblQeFt3UtGqY-neHJkC4mUxuHjxGWAWdksyiAxkMb8DYRLkvIseUfkbKbeO6Dt807QwMkspFmXYdzljm8DLho6nMQfC--jtfy8B2gONhA9YUmK2o~fUHwTHzTXXqNGct2hQl-B9cFLDBdj8LXWTj-75YInwWxLwtoenKK~qLahGtJXKXvxTVltxMvUYXXvP9F~WfhNIhNqns1JKrrrqJ~N1XunZHCv~IVJyzOEvrn2G4J5LMIn~dcEZ9frV3APHsE4D~HQA__&Key-Pair-Id=APKAIEYUVEN4EVB2OKEQ'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

with open('video.mp4', 'wb') as f_out:
    r = requests.get(url, headers=headers, stream=True)
    print(r)
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)
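Note that this signed URL carries Expires and Signature query parameters, so it stops working once the timestamp passes. Extracting the current src from the page, as the next answer does, avoids hard-coding a link that expires.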
This is how you extract the video link from Streamable programmatically:
import requests
from bs4 import BeautifulSoup

# fetch the Streamable page and pull the src from its <video> tag
url = "https://streamable.com/a50s3e"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
video = soup.find_all('video')
temp = video[0].get("src")
url = "https:" + temp
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}

with open('video.mp4', 'wb') as f_out:
    r = requests.get(url, headers=headers, stream=True)
    print(r)
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)

'str' object has no attribute 'text' in BeautifulSoup4

I'm trying to log in to a website using Python requests for automation purposes, and I'm getting the error 'str' object has no attribute 'text'.
My code is:
from BeautifulSoup import BeautifulSoup

logging.basicConfig(filename='app.log', level=logging.INFO)
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

def post_request_curl(data):
    try:
        with requests.Session() as s:
            login_data = data['login_details']
            r = s.get(url=data['url'], headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            login_data['form_build_id'] = soup.find_all('input', attrs={'name': 'form_build_id'})['value']
            r = s.post(url=data['url'], data=login_data, headers=headers)
            return r
    except Exception as e:
        logging.error('Error occurred ' + str(e))
Try importing BeautifulSoup from the bs4 package (the BeautifulSoup module is the old 3.x series) and use find() instead of find_all() when you index into a single tag:
from bs4 import BeautifulSoup
import requests
import logging

logging.basicConfig(filename='app.log', level=logging.INFO)
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

def post_request_curl(data):
    try:
        with requests.Session() as s:
            login_data = data['login_details']
            r = s.get(url=data['url'], headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            # find() returns a single Tag; find_all() returns a list,
            # which cannot be indexed with ['value']
            login_data['form_build_id'] = soup.find('input', attrs={'name': 'form_build_id'})['value']
            r = s.post(url=data['url'], data=login_data, headers=headers)
            return r
    except Exception as e:
        logging.error('Error occurred ' + str(e))
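A hedged usage sketch for the fixed function; the URL and form field names are hypothetical placeholders, since the original question never shows them:

result = post_request_curl({
    'url': 'https://example.com/user/login',              # hypothetical login page
    'login_details': {'name': 'user', 'pass': 'secret'}   # hypothetical form fields
})
if result is not None:
    print(result.status_code)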

How to pass captcha while scraping amazon website

I am trying to scrape some basic data from Amazon with Python. Everything works great, but in most cases I cannot get past the captcha check.
I tried waiting longer with the sleep function, but it doesn't help.
Is there any way to work around the captcha check?
........
def parse(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    try:
        # Retrying for failed requests
        for i in range(40):
            # Generating random delays
            sleep(randint(1, 10))
            # Adding verify=False to avoid ssl related issues
            response = requests.get(url, headers=headers, verify=False)
            if response.status_code == 200:
                doc = html.fromstring(response.content)
                XPATH_NAME = '//h1[@id="title"]//text()'
                XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
                XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
                XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
                XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
                RAW_NAME = doc.xpath(XPATH_NAME)
                RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
                RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
                RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
                RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
                NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
                SALE_PRICE = ' '.join(''.join(RAW_SALE_PRICE).split()).strip() if RAW_SALE_PRICE else None
                CATEGORY = ' > '.join([i.strip() for i in RAW_CATEGORY]) if RAW_CATEGORY else None
                ORIGINAL_PRICE = ''.join(RAW_ORIGINAL_PRICE).strip() if RAW_ORIGINAL_PRICE else None
                AVAILABILITY = ''.join(RAW_AVAILABILITY).strip() if RAW_AVAILABILITY else None
                if not ORIGINAL_PRICE:
                    ORIGINAL_PRICE = SALE_PRICE
                # retrying in case of captcha
                if not NAME:
                    raise ValueError('captcha')
                data = {
                    'NAME': NAME,
                    'SALE_PRICE': SALE_PRICE,
                    'CATEGORY': CATEGORY,
                    'ORIGINAL_PRICE': ORIGINAL_PRICE,
                    'AVAILABILITY': AVAILABILITY,
                    'URL': url,
                }
                return data
            elif response.status_code == 404:
                break
    except Exception as e:
        print(e)
def ReadAsin():
..........
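One common mitigation, sketched below as an assumption-laden variant rather than a guaranteed workaround: detect Amazon's captcha interstitial explicitly and back off exponentially instead of raising out of the retry loop. The marker string and delay schedule are assumptions to adapt:

import time
import requests

def fetch_with_backoff(url, headers, max_tries=5):
    # Sketch: Amazon's captcha page typically contains the text
    # "Enter the characters you see below" (an assumption; check the actual page)
    for attempt in range(max_tries):
        response = requests.get(url, headers=headers)
        if response.status_code == 200 and 'Enter the characters' not in response.text:
            return response
        time.sleep(2 ** attempt)  # 1s, 2s, 4s, ... between attempts
    return None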
