I'm making a Discord Bot in Python to scrape Hack The Box data.
This is already functional, but I want to use async with aiohttp for increase speed when I'm requesting each profile of each member.
So in the synchronous version, I made a login function that first make a get request, to get the token on the login page, then make a post request with the token, email and password.
And in the asynchronous version with aiohttp, when I do my post request, my session is not connected.
I shortened it a little bit just for performance testing:
import requests
import re
import json
from scrapy.selector import Selector
import config as cfg
from timeit import default_timer
class HTBot():
def __init__(self, email, password, api_token=""):
self.email = email
self.password = password
self.api_token = api_token
self.session = requests.Session()
self.headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36"
}
self.payload = {'api_token': self.api_token}
if path.exists("users.txt"):
with open("users.txt", "r") as f:
self.users = json.loads(f.read())
else:
self.users = []
def login(self):
req = self.session.get("https://www.hackthebox.eu/login", headers=self.headers)
html = req.text
csrf_token = re.findall(r'type="hidden" name="_token" value="(.+?)"', html)
if not csrf_token:
return False
data = {
"_token": csrf_token[0],
"email": self.email,
"password": self.password
}
req = self.session.post("https://www.hackthebox.eu/login", data=data, headers=self.headers)
if req.status_code == 200:
print("Connecté à HTB !")
self.session.headers.update(self.headers)
return True
print("Connexion impossible.")
return False
def extract_user_info(self, htb_id):
infos = {}
req = self.session.get("https://www.hackthebox.eu/home/users/profile/" + str(htb_id), headers=self.headers)
if req.status_code == 200:
body = req.text
html = Selector(text=body)
infos["username"] = html.css('div.header-title > h3::text').get().strip()
infos["avatar"] = html.css('div.header-icon > img::attr(src)').get()
infos["points"] = html.css('div.header-title > small > span[title=Points]::text').get().strip()
infos["systems"] = html.css('div.header-title > small > span[title="Owned Systems"]::text').get().strip()
infos["users"] = html.css('div.header-title > small > span[title="Owned Users"]::text').get().strip()
infos["respect"] = html.css('div.header-title > small > span[title=Respect]::text').get().strip()
infos["country"] = Selector(text=html.css('div.header-title > small > span').getall()[4]).css('span::attr(title)').get().strip()
infos["level"] = html.css('div.header-title > small > span::text').extract()[-1].strip()
infos["rank"] = re.search(r'position (\d+) of the Hall of Fame', body).group(1)
infos["challs"] = re.search(r'has solved (\d+) challenges', body).group(1)
infos["ownership"] = html.css('div.progress-bar-success > span::text').get()
return infos
return False
def refresh_user(self, htb_id, new=False):
users = self.users
for user in users:
if user["htb_id"] == htb_id:
infos = self.extract_user_info(htb_id)
def refresh_all_users(self):
users = self.users
for user in users:
self.refresh_user(user["htb_id"])
elapsed = default_timer() - START_TIME
time_completed_at = "{:5.2f}s".format(elapsed)
print("{0:<30} {1:>20}".format(user["username"], time_completed_at))
print("Les users ont été mis à jour !")
htbot = HTBot(cfg.HTB['email'], cfg.HTB['password'], cfg.HTB['api_token'])
htbot.login()
START_TIME = default_timer()
htbot.refresh_all_users()
Then, my async rewrite only for the login function :
import asyncio
import re
import config as cfg
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36"
}
LOGIN_LOCK = asyncio.Lock()
async def login():
async with LOGIN_LOCK:
async with aiohttp.TCPConnector(share_cookies=True) as connector:
async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
async with session.get("https://www.hackthebox.eu/login") as req:
html = await req.text()
csrf_token = re.findall(r'type="hidden" name="_token" value="(.+?)"', html)
if not csrf_token:
return False
payload = {
"_token": csrf_token[0],
"email": cfg.HTB['email'],
"password": cfg.HTB['password']
}
async with session.post('https://www.hackthebox.eu/login', data=payload) as req:
print(await req.text())
exit()
async def main():
await login()
asyncio.run(main())
I think I'm going too far with this BaseConnector, Locks etc but I've been working on it for two days now and I'm running out of ideas, I'm already trying to connect with this post request.
I also did a comparison of the two requests with Requests and aiohttp in Wireshark.
The only difference is that the one with aiohttp doesn't send keepalive and has cookies. (I already tried to manually set the header "connection: keep-alive" but it doesn't change anything).
However, according to the documentation, keep-alive should be active by default, so I don't understand.
(In the screen the 301 status codes are normal, for seeing my HTTP requests I had to use http instead of https.)
Screen of Wireshark : https://files.catbox.moe/bignh0.PNG
Thank you if you can help me !
Since I'm new to asynchronous programming, I'll take all your advice.
Unfortunately almost everything I read about it on the internet is deprecated for Python 3.7+ and doesn't use the new syntaxes.
Okay, I have finally switched to httpx and it worked like a charm.
I really don't know why aiohttp wouldn't work.
Related
I am using python-proxy package for converting my ssh connection to a sock5 proxy, facing issues in making sock5 http request after creating server.
Using python package:https://github.com/qwj/python-proxy & reference example:https://github.com/qwj/python-proxy/blob/master/tests/api_server.py
import asyncio
import pproxy
loop = asyncio.get_event_loop()
async def make_request():
import requests
import socks
import socket
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "localhost", 8081)
socket.socket = socks.socksocket
proxies = {'http': "socks5://127.0.0.1:8081"}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}
url = u'https://api.ipgeolocationapi.com/geolocate/5.152.122.170'
print(requests.get(url,verify=False,headers=headers).text)
return "test"
async def ssh_handle():
print("1")
server = pproxy.Server('socks5://127.0.0.1:8081')
remote = pproxy.Connection('ssh://185.110.12.11/#root:test')
args = dict(rserver=[remote],
verbose=print)
await server.start_server(args)
print("server started now")
await asyncio.sleep(1)
await make_request() #-- after creating server calling function of making http request using proxy
return "done"
try:
loop.run_until_complete(ssh_handle())
loop.run_forever()
except Exception as e:
print(e)```
[1]: https://github.com/qwj/python-proxy
[2]: https://github.com/qwj/python-proxy/blob/master/tests/api_server.py
I am trying to scrape some basic data from amazon via python. Everything works great but I can not pass captcha control in most of cases.
I tried to wait longer with sleep function but it doesn't work.
Is there any way to work around captcha control?
........
def parse(url):
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
}
try:
# Retrying for failed requests
for i in range(40):
# Generating random delays
sleep(randint(1,10))
# Adding verify=False to avold ssl related issues
response = requests.get(url, headers=headers, verify=False)
if response.status_code == 200:
doc = html.fromstring(response.content)
XPATH_NAME = '//h1[#id="title"]//text()'
XPATH_SALE_PRICE = '//span[contains(#id,"ourprice") or contains(#id,"saleprice")]/text()'
XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
XPATH_CATEGORY = '//a[#class="a-link-normal a-color-tertiary"]//text()'
XPATH_AVAILABILITY = '//div[#id="availability"]//text()'
RAW_NAME = doc.xpath(XPATH_NAME)
RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
RAw_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
SALE_PRICE = ' '.join(''.join(RAW_SALE_PRICE).split()).strip() if RAW_SALE_PRICE else None
CATEGORY = ' > '.join([i.strip() for i in RAW_CATEGORY]) if RAW_CATEGORY else None
ORIGINAL_PRICE = ''.join(RAW_ORIGINAL_PRICE).strip() if RAW_ORIGINAL_PRICE else None
AVAILABILITY = ''.join(RAw_AVAILABILITY).strip() if RAw_AVAILABILITY else None
if not ORIGINAL_PRICE:
ORIGINAL_PRICE = SALE_PRICE
# retrying in case of captcha
if not NAME:
raise ValueError('captcha')
data = {
'NAME': NAME,
'SALE_PRICE': SALE_PRICE,
'CATEGORY': CATEGORY,
'ORIGINAL_PRICE': ORIGINAL_PRICE,
'AVAILABILITY': AVAILABILITY,
'URL': url,
}
return data
elif response.status_code==404:
break
except Exception as e:
print (e)
def ReadAsin():
..........
As you know, Instagram announced they has changed their endpoint apis this month.
Looks like in the wake of Cambridge Analytica instagram has changed up their endpoint formats and require a logged in user session for all requests.....
Not sure which endpoints need updating but I was specifically using the media/comments endpoints which are now as follows:
Media OLD:
https://www.instagram.com/graphql/query/?query_id=17888483320059182&id={0}&first=100&after={1}
Media NEW:
https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%7B%22id%22%3A%2221575514%22%2C%22first%22%3A12%2C%22after%22%3A%22AQAHXuz1DPmI3FFLOzy5iKEhHOLKw3lt_ozVR40TphSdns0Vp5j_ZEU6Qj0CW6IqNtVGO5pmLCQoX0Y8RVS9aRTT2lWPp6vf8vFqjo1QfxRYmA%22%7D
The script that I used for avoiding this problem is as following:
#!/usr/bin/env python3
import requests
import urllib.parse
import hashlib
import json
#CHROME_UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
CHROME_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
def getSession_old(rhx_gis, csrf_token, variables):
""" Get session preconfigured with required headers & cookies. """
#"rhx_gis:csfr_token:user_agent:variables"
print(variables)
values = "%s:%s:%s:%s" % (
rhx_gis,
csrf_token,
CHROME_UA,
variables)
x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
session = requests.Session()
session.headers = {
'user-agent': CHROME_UA,
'x-instagram-gis': x_instagram_gis
}
print(x_instagram_gis)
session.cookies.set('ig_pr', '2')
session.cookies.set('csrftoken', csrf_token)
return session
def getSession(rhx_gis, variables):
""" Get session preconfigured with required headers & cookies. """
#"rhx_gis:csfr_token:user_agent:variables"
values = "%s:%s" % (
rhx_gis,
variables)
x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
session = requests.Session()
session.headers = {
'x-instagram-gis': x_instagram_gis
}
return session
if __name__ == '__main__':
session = requests.Session()
session.headers = { 'user-agent': CHROME_UA }
response = session.get("https://www.instagram.com/selenagomez")
data = json.loads(response.text.split("window._sharedData = ")[1].split(";</script>")[0])
csrf = data['config']['csrf_token']
rhx_gis = data['rhx_gis']
variables = '{"id":"460563723","first":10,"after":"AQBf8puhlt8nU2JzmYdMMTuH0FbMgUM1fnIOZIH7n94DM4VLWkVILUAKVB-5dqvxQEI-Wd0ttlEDzimaaqwC98jccQaDQT4tSF56c_NlWi_shg"}'
session = getSession(rhx_gis, variables)
query_hash = '42323d64886122307be10013ad2dcc44'
encoded_vars = urllib.parse.quote(variables, safe='"')
url = 'https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s' % (query_hash, encoded_vars)
print(url)
print(session.get(url).text)
I am sure this script was working well before 11 days ago, but not working now.
Does anyone know the solution how to get user posts without authenticating?
Login Address: https://joffice.jeunesseglobal.com/login.asp.
Two data need to put: Username and pw.
Using cookie to access:https://joffice.jeunesseglobal.com/members/back_office.asp
Can't login.
#-*-coding:utf8-*-
import urllib
import http.cookiejar
url = 'https://joffice.jeunesseglobal.com/members/back_office.asp'
login_url = "https://joffice.jeunesseglobal.com/login.asp"
login_username = "jianghong181818"
login_password = "Js#168168!"
login_data = {
"Username" : login_username,
"pw" : login_password,
}
post_data = urllib.parse.urlencode(login_data).encode('utf-8')
headers = {'User-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
req = urllib.request.Request(login_url, headers = headers, data = post_data)
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
resp = opener.open(req)
print(resp.read().decode('utf-8'))
Use requests
Simple way:
>>>import requests
>>>page = requests.get(" https://joffice.jeunesseglobal.com/login.asp", auth=
('username', 'password'))
Making requests with HTTP Basic Auth
>>> from requests.auth import HTTPBasicAuth
>>> requests.get(" https://joffice.jeunesseglobal.com/login.asp", auth=HTTPBasicAuth('user', 'pass'))
I'm trying to login scrape tumblr but as when you log into the website normally through a browser, it kind of has two steps (enter email first and it check whether there is an account associated with that email then you can enter your password if the email is correct). Unfortunately, this flags up some problems when trying to automate this login without using the requests module (I'm trying to do it using urllib.request and urllib.parse which are already available in python 3.6) as there is no explicit way to start a session so you can keep the same session for the email verification and then entering the email.
Do I need to use cookies to do this or will I have to install the requests module? My code so far looks a bit like this:
import urllib.request
import urllib.parse
from html.parser import HTMLParser
input_tags = []
class myHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
if tag == "input":
for i in range(len(attrs)):
if attrs[i][0] == "name" and attrs[i][1] == "form_key":
input_tags.append(attrs[i+1][1])
parser = myHTMLParser()
form_key = ""
def get_form_key():
global form_key
global input_tags
url = "https://www.tumblr.com/login"
req = urllib.request.Request(url)
resp = urllib.request.urlopen(req)
resp = resp.read()
parser.feed(str(resp))
print(input_tags)
form_key = input_tags
print("form key is : ", form_key)
if len(form_key) > 1:
form_key = form_key[:1]
print("\nform key should be one value long now: ", form_key)
get_form_key()
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
url = "https://www.tumblr.com/login"
login_data = {
"determine_email" : "my.email#email.com",
"user[email]" : "my.email#email.com",
"user[password]" : "secretpassword",
"tumblrlog[name]" : "",
"user[age]" : "",
"http_referer" : "https://www.tumblr.com/logout",
"form_key" : form_key
}
encoded_data = urllib.parse.urlencode(data)
encoded_data = encoded_data.encode("utf-8")
request = urllib.request.Request(url, headers = headers, data = encoded_data)
response = urllib.request.urlopen(request)
response_url = response.geturl()
print(response_url)
This prints out the form key twice (not that important that's just from me bug checking) and then it returns the url:
https://www.tumblr.com/login
which indicates that the loin was not successful.
Any idea how to fix this?