How to pass captcha while scraping amazon website - python-3.x

I am trying to scrape some basic data from Amazon with Python. Everything works fine, but in most cases I cannot get past the captcha check.
I tried waiting longer with the sleep function, but it doesn't help.
Is there any way to work around the captcha check?
........
def parse(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    try:
        # Retrying for failed requests
        for i in range(40):
            # Generating random delays
            sleep(randint(1, 10))
            # Adding verify=False to avoid SSL-related issues
            response = requests.get(url, headers=headers, verify=False)
            if response.status_code == 200:
                doc = html.fromstring(response.content)
                XPATH_NAME = '//h1[@id="title"]//text()'
                XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
                XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
                XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
                XPATH_AVAILABILITY = '//div[@id="availability"]//text()'
                RAW_NAME = doc.xpath(XPATH_NAME)
                RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
                RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
                RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
                RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
                NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
                SALE_PRICE = ' '.join(''.join(RAW_SALE_PRICE).split()).strip() if RAW_SALE_PRICE else None
                CATEGORY = ' > '.join([i.strip() for i in RAW_CATEGORY]) if RAW_CATEGORY else None
                ORIGINAL_PRICE = ''.join(RAW_ORIGINAL_PRICE).strip() if RAW_ORIGINAL_PRICE else None
                AVAILABILITY = ''.join(RAW_AVAILABILITY).strip() if RAW_AVAILABILITY else None
                if not ORIGINAL_PRICE:
                    ORIGINAL_PRICE = SALE_PRICE
                # Retrying in case of captcha
                if not NAME:
                    raise ValueError('captcha')
                data = {
                    'NAME': NAME,
                    'SALE_PRICE': SALE_PRICE,
                    'CATEGORY': CATEGORY,
                    'ORIGINAL_PRICE': ORIGINAL_PRICE,
                    'AVAILABILITY': AVAILABILITY,
                    'URL': url,
                }
                return data
            elif response.status_code == 404:
                break
    except Exception as e:
        print(e)
def ReadAsin():
..........
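There is no reliable way to solve Amazon's captcha with requests alone; what usually helps is detecting the captcha page and retrying with a fresh User-Agent and a longer backoff, or switching to the official Product Advertising API. Note also that raise ValueError('captcha') jumps straight to the except clause, which sits outside the for loop, so the code above never actually retries after a captcha. Below is a minimal retry sketch, assuming requests and lxml are installed; the helper name fetch_with_captcha_retry and the User-Agent list are purely illustrative.

# A minimal retry sketch (assumptions: requests and lxml are available;
# the User-Agent strings are examples, not required values).
import random
import time

import requests
from lxml import html

USER_AGENTS = [
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
]

def fetch_with_captcha_retry(url, max_tries=10):
    """Return an lxml document, or None if every attempt hit the captcha page."""
    for attempt in range(max_tries):
        # Rotate the User-Agent and back off a little more on every attempt.
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        time.sleep(random.uniform(1, 5) * (attempt + 1))
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            continue
        doc = html.fromstring(response.content)
        # If the product title is missing, treat the response as Amazon's
        # captcha interstitial and retry instead of raising out of the loop.
        if doc.xpath('//h1[@id="title"]//text()'):
            return doc
    return None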

Related

Not able to download images from google

Below is my code to download images from Google. When I run it, only 80 images are downloaded.
import os
import requests
from bs4 import BeautifulSoup

GOOGLE_IMAGE = 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'
# PATH = r'C:\Program Files\edge\msedgedriver.exe'
usr_agent = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.63',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
SAVE_FOLDER = 'C:\\Users\\sansh\\Desktop\\Py\\scrape\\new'

def main():
    if not os.path.exists(SAVE_FOLDER):
        os.mkdir(SAVE_FOLDER)
    download_images()

def download_images():
    data = input('What are u searching for?')
    n_images = int(input('How many images do you want?'))
    print('Start searching ...')
    search_url = GOOGLE_IMAGE + 'q=' + data.replace('_', '+')
    print(search_url)
    response = requests.get(search_url, headers=usr_agent)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all('img', {'class': 'rg_i Q4LuWd'}, limit=n_images)
    image_links = []
    for res in results:
        try:
            link = res['data-src']
            image_links.append(link)
        except KeyError:
            continue
    print(f'Found {len(image_links)} images')
    print("Starting downloader ...")
    for i, image_link in enumerate(image_links):
        down = requests.get(image_link)
        image_name = SAVE_FOLDER + '/' + data + str(i + 1) + '.jpg'
        with open(image_name, 'wb') as file:
            file.write(down.content)
    print("Download completed ...")

if __name__ == '__main__':
    main()
I am not sure what the problem is, and no error is shown. If possible, can anyone help me solve this?
Here is a screenshot of the result after running this code.
[screenshot][1]
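This is most likely not an error at all: the static HTML that requests receives only contains the first batch of thumbnail img tags (which is why it stops around 80); the remaining results are injected by JavaScript as the page scrolls, so BeautifulSoup never sees them. Getting more generally requires driving a real browser and scrolling before parsing. Below is a rough sketch using Selenium, assuming it and a matching browser driver are installed; the helper name, the scroll cap, and the 'rg_i Q4LuWd' class are illustrative, and Google changes its class names frequently.

# Rough sketch: scroll the results page in a real browser so more
# thumbnails load, then parse the rendered HTML.
import time
from bs4 import BeautifulSoup
from selenium import webdriver

def collect_thumbnail_links(search_url, n_images, max_scrolls=20, scroll_pause=2):
    driver = webdriver.Edge()   # or webdriver.Chrome(); needs a matching driver
    driver.get(search_url)
    links = []
    for _ in range(max_scrolls):
        # Scroll to the bottom so the page loads another batch of thumbnails.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        links = [img['data-src'] for img in soup.find_all('img', {'class': 'rg_i Q4LuWd'})
                 if img.has_attr('data-src')]
        if len(links) >= n_images:
            break
    driver.quit()
    return links[:n_images]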

How to properly do a Facebook mobile site login

I'm trying to develop some code to perform successful Facebook logins. To simplify as much as possible, I use the mbasic.facebook.com address.
My code is the following (using requests on the latest Python version):
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    base_url = 'https://mbasic.facebook.com'
    with requests.session() as session:
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/76.0.3809.87 Safari/537.36"
        )
        cookie = 'locale=it_IT;'
        default_headers = {
            'User-Agent': user_agent,
            'Accept-Language': 'it-IT,en;q=0.5',
            'cookie': cookie,
        }
        session.headers.update(default_headers)
        login_form_url = ('/login/device-based/regular/login/'
                          '?refsrc=https%3A%2F%2Fmbasic.facebook.com%2F&lwv=100&ref=dbl')
        r = session.get("https://mbasic.facebook.com/login/")
        page1 = BeautifulSoup(r.text, "lxml")
        form = page1.find('form')
        lsd = page1.find('input', {'name': 'lsd'})['value']
        jazoest = page1.find('input', {'name': 'jazoest'})['value']
        mts = page1.find('input', {'name': 'm_ts'})['value']
        li = page1.find('input', {'name': 'li'})['value']
        try_number = page1.find('input', {'name': 'try_number'})['value']
        unrecognized_tries = page1.find('input', {'name': 'unrecognized_tries'})['value']
        # 'credentials' holds the email/password used below
        data = {'lsd': lsd, 'jazoest': jazoest, 'm_ts': mts, 'li': li, 'try_number': try_number,
                'unrecognized_tries': unrecognized_tries, 'email': credentials["email"],
                'pass': credentials["pass"], 'login': 'Accedi'}
        r = session.post(base_url + login_form_url, data=data, verify=False)
        # now I need to complete the second part of the login
        h = open("first_login.html", "w", encoding="utf-8")
        h.write(r.text)
        c = BeautifulSoup(r.text, "lxml")
        form = c.find('a')
        action = form.get('href')
        r = session.get(base_url + action, data=data, verify=False)
        f = open("second_login.html", "w", encoding="utf-8")
        f.write(r.text)
With this code I successfully get my home feed as a logged-in user. However, the problem begins when I try to move, for instance, to a specific public Facebook page: it returns the page as if I weren't logged in. The same odd thing happens when I request a specific post: it doesn't show any comments, unlike in my browser.
I tried playing with the session cookies, but to no avail.
Any help is appreciated.
The solution was to change the user agent to:
Mozilla/5.0 (BB10; Kbd) AppleWebKit/537.35+ (KHTML, like Gecko) Version/10.3.3.3057 Mobile Safari/537.35+
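Applied to the snippet above, that only means swapping the User-Agent string before the headers are attached to the session; everything else stays the same (a minimal sketch):

# Only the User-Agent changes; the rest of the login flow above is unchanged.
user_agent = (
    "Mozilla/5.0 (BB10; Kbd) AppleWebKit/537.35+ "
    "(KHTML, like Gecko) Version/10.3.3.3057 Mobile Safari/537.35+"
)
session.headers.update({'User-Agent': user_agent})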

Why does reddit return a 502 error when accessing a page using beautifulsoup4

I have written a small script to check whether a username exists on popular websites like Facebook, Instagram, etc.
Here is the code.
import requests
from termcolor import colored, cprint
from time import sleep
from bs4 import BeautifulSoup

status_code_html = 'https://en.wikipedia.org/wiki/List_of_HTTP_status_codes'
uname = input("Enter the username: ")
width = 10
websites = {
    'Facebook': 'https://www.facebook.com/',
    'Twitter': 'https://twitter.com/',
    'Instagram': 'https://www.instagram.com/',
    'Youtube': 'https://www.youtube.com/user/',
    'Reddit': 'https://www.reddit.com/user/'
}
for site, url in websites.items():
    try:
        response = requests.get(url + uname)
        page = requests.get(status_code_html)
        soup = BeautifulSoup(page.content, 'html.parser')
        tag = soup.find(id=response.status_code)
        status = tag.find_parent('dt').text
        response.raise_for_status()
    except:
        print(site.rjust(width), ' :', 'Fail'.ljust(width), '(Status:', status, ')')
    else:
        print(site.rjust(width), ' :', 'Success'.ljust(width), '(Status:', status, ')')
Output of the above code is
Enter the username: ********
Facebook : Success (Status: 200 OK )
Twitter : Success (Status: 200 OK )
Instagram : Success (Status: 200 OK )
Youtube : Success (Status: 200 OK )
Reddit : Fail (Status: 502 Bad Gateway )
This code works for every website except reddit.com, where requests.get() returns a 502 error page. Can someone help me resolve this issue?
Adding the user agent in the headers parameter should fix that:
import requests
from termcolor import colored, cprint
from time import sleep
from bs4 import BeautifulSoup

status_code_html = 'https://en.wikipedia.org/wiki/List_of_HTTP_status_codes'
uname = input("Enter the username: ")
width = 10
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36'}
websites = {
    'Facebook': 'https://www.facebook.com/',
    'Twitter': 'https://twitter.com/',
    'Instagram': 'https://www.instagram.com/',
    'Youtube': 'https://www.youtube.com/user/',
    'Reddit': 'https://www.reddit.com/user/'
}
for site, url in websites.items():
    try:
        response = requests.get(url + uname, headers=headers)
        page = requests.get(status_code_html)
        soup = BeautifulSoup(page.content, 'html.parser')
        tag = soup.find(id=response.status_code)
        status = tag.find_parent('dt').text
        response.raise_for_status()
    except:
        print(site.rjust(width), ' :', 'Fail'.ljust(width), '(Status:', status, ')')
    else:
        print(site.rjust(width), ' :', 'Success'.ljust(width), '(Status:', status, ')')
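As a side note, fetching the Wikipedia status-code page on every check is only there to turn the numeric code into text; the standard library already ships that mapping, so a leaner variant (a sketch, behavior otherwise unchanged) could look like this:

# Sketch: http.client.responses maps e.g. 200 -> 'OK', and
# response.reason carries the server's own phrase as a fallback.
from http.client import responses

status = f"{response.status_code} {responses.get(response.status_code, response.reason)}"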

aiohttp: Trying to connect to a site

I'm making a Discord bot in Python to scrape Hack The Box data.
It is already functional, but I want to use async with aiohttp to speed up requesting each member's profile.
In the synchronous version, I made a login function that first makes a GET request to grab the token from the login page, then makes a POST request with the token, email and password.
In the asynchronous version with aiohttp, when I make the POST request, my session is not logged in.
I shortened it a little bit just for performance testing:
import requests
import re
import json
from os import path
from scrapy.selector import Selector
import config as cfg
from timeit import default_timer

class HTBot():
    def __init__(self, email, password, api_token=""):
        self.email = email
        self.password = password
        self.api_token = api_token
        self.session = requests.Session()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36"
        }
        self.payload = {'api_token': self.api_token}
        if path.exists("users.txt"):
            with open("users.txt", "r") as f:
                self.users = json.loads(f.read())
        else:
            self.users = []

    def login(self):
        req = self.session.get("https://www.hackthebox.eu/login", headers=self.headers)
        html = req.text
        csrf_token = re.findall(r'type="hidden" name="_token" value="(.+?)"', html)
        if not csrf_token:
            return False
        data = {
            "_token": csrf_token[0],
            "email": self.email,
            "password": self.password
        }
        req = self.session.post("https://www.hackthebox.eu/login", data=data, headers=self.headers)
        if req.status_code == 200:
            print("Connected to HTB!")
            self.session.headers.update(self.headers)
            return True
        print("Unable to connect.")
        return False

    def extract_user_info(self, htb_id):
        infos = {}
        req = self.session.get("https://www.hackthebox.eu/home/users/profile/" + str(htb_id), headers=self.headers)
        if req.status_code == 200:
            body = req.text
            html = Selector(text=body)
            infos["username"] = html.css('div.header-title > h3::text').get().strip()
            infos["avatar"] = html.css('div.header-icon > img::attr(src)').get()
            infos["points"] = html.css('div.header-title > small > span[title=Points]::text').get().strip()
            infos["systems"] = html.css('div.header-title > small > span[title="Owned Systems"]::text').get().strip()
            infos["users"] = html.css('div.header-title > small > span[title="Owned Users"]::text').get().strip()
            infos["respect"] = html.css('div.header-title > small > span[title=Respect]::text').get().strip()
            infos["country"] = Selector(text=html.css('div.header-title > small > span').getall()[4]).css('span::attr(title)').get().strip()
            infos["level"] = html.css('div.header-title > small > span::text').extract()[-1].strip()
            infos["rank"] = re.search(r'position (\d+) of the Hall of Fame', body).group(1)
            infos["challs"] = re.search(r'has solved (\d+) challenges', body).group(1)
            infos["ownership"] = html.css('div.progress-bar-success > span::text').get()
            return infos
        return False

    def refresh_user(self, htb_id, new=False):
        users = self.users
        for user in users:
            if user["htb_id"] == htb_id:
                infos = self.extract_user_info(htb_id)

    def refresh_all_users(self):
        users = self.users
        for user in users:
            self.refresh_user(user["htb_id"])
            elapsed = default_timer() - START_TIME
            time_completed_at = "{:5.2f}s".format(elapsed)
            print("{0:<30} {1:>20}".format(user["username"], time_completed_at))
        print("Users have been updated!")

htbot = HTBot(cfg.HTB['email'], cfg.HTB['password'], cfg.HTB['api_token'])
htbot.login()
START_TIME = default_timer()
htbot.refresh_all_users()
Then, my async rewrite of just the login function:
import asyncio
import re

import aiohttp
import config as cfg

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36"
}
LOGIN_LOCK = asyncio.Lock()

async def login():
    async with LOGIN_LOCK:
        async with aiohttp.TCPConnector(share_cookies=True) as connector:
            async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
                async with session.get("https://www.hackthebox.eu/login") as req:
                    html = await req.text()
                csrf_token = re.findall(r'type="hidden" name="_token" value="(.+?)"', html)
                if not csrf_token:
                    return False
                payload = {
                    "_token": csrf_token[0],
                    "email": cfg.HTB['email'],
                    "password": cfg.HTB['password']
                }
                async with session.post('https://www.hackthebox.eu/login', data=payload) as req:
                    print(await req.text())
                exit()

async def main():
    await login()

asyncio.run(main())
I think I'm going too far with this BaseConnector, the locks, etc., but I've been working on it for two days now and I'm running out of ideas; I'm still only trying to log in with this POST request.
I also compared the two requests, Requests vs. aiohttp, in Wireshark.
The only difference is that the aiohttp one doesn't send keep-alive and has cookies. (I already tried setting the "Connection: keep-alive" header manually, but it doesn't change anything.)
However, according to the documentation, keep-alive should be active by default, so I don't understand.
(In the screenshot the 301 status codes are normal; to see my HTTP requests I had to use http instead of https.)
Screenshot of Wireshark: https://files.catbox.moe/bignh0.PNG
Thank you if you can help!
Since I'm new to asynchronous programming, I'll take any advice you have.
Unfortunately, almost everything I've read about it online is deprecated for Python 3.7+ and doesn't use the new syntax.
Okay, I have finally switched to httpx and it worked like a charm.
I really don't know why aiohttp wouldn't work.
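For reference, a minimal sketch of that login flow with httpx (assuming a recent httpx where redirects are enabled via follow_redirects; the regex and the config module are carried over from the snippet above, and cookie handling relies on the client's defaults):

import asyncio
import re

import httpx
import config as cfg

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36"
}

async def login():
    # A single AsyncClient keeps cookies across the GET (token) and the POST (credentials).
    async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
        r = await client.get("https://www.hackthebox.eu/login")
        csrf_token = re.findall(r'type="hidden" name="_token" value="(.+?)"', r.text)
        if not csrf_token:
            return False
        payload = {
            "_token": csrf_token[0],
            "email": cfg.HTB['email'],
            "password": cfg.HTB['password'],
        }
        r = await client.post("https://www.hackthebox.eu/login", data=payload)
        return r.status_code == 200

asyncio.run(login())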

Why does using a cookie to log in to a website fail in Python 3?

I want to use a cookie I obtained to access a website and get some information that can only be seen after logging in, but when I try it, the result shows the user isn't logged in to the website. This is my code; can someone tell me how to fix the problem?
import re
import urllib.parse
import urllib.request
import urllib.error
import http.cookiejar
from bs4 import BeautifulSoup

LOGIN_URL = "https://www.yaozh.com/login/"
values = {'username': 'username', 'pwd': 'password'}  # , 'submit': 'Login'
postdata = urllib.parse.urlencode(values).encode()
user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36')
headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}
cookie_filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
    response = opener.open(request)
    page = response.read().decode()
    # print(page)
except urllib.error.URLError as e:
    print(e.code, ':', e.reason)
cookie.save(ignore_discard=True, ignore_expires=True)
print(cookie)
for item in cookie:
    print('Name = ' + item.name)
    print('Value = ' + item.value)

get_url = 'https://db.yaozh.com/instruct?p=1&pageSize=20'
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
print(get_response.read())
bs = BeautifulSoup(get_response, "html.parser")
urls = bs.find_all(name='a', attrs={"href": re.compile(r"\.doc")}, recursive=True)
print(len(urls))
for url in urls:
    print(url["href"])
The problem has been solved. If you face the same problem, check whether the data you POST to the server is complete: many websites also expect hidden form fields that real users never see and use them to decide whether the request comes from a real user. Good luck!
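In practice that usually means fetching the login page first, copying every hidden input field into the POST data, and only then submitting the credentials. A rough sketch with the same urllib/BeautifulSoup stack (the hidden field names are whatever the site's login form actually contains; nothing here is specific to yaozh.com):

# Sketch: collect the login form's hidden fields before posting credentials.
login_page = opener.open(urllib.request.Request(LOGIN_URL, headers=headers)).read().decode()
form = BeautifulSoup(login_page, 'html.parser').find('form')

values = {'username': 'username', 'pwd': 'password'}
for hidden in form.find_all('input', type='hidden'):
    # Copy every hidden field (CSRF tokens, timestamps, ...) into the payload.
    values.setdefault(hidden.get('name'), hidden.get('value', ''))

postdata = urllib.parse.urlencode(values).encode()
response = opener.open(urllib.request.Request(LOGIN_URL, postdata, headers))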
