Aiohttp+Asyncio Seems To Be Inconsistent on the Tripadvisor Travel Site - python-3.x

I was trying to asynchronously request page data from the Tripadvisor travel site using aiohttp+asyncio, but on multiple occasions the get() method gets stuck for almost a minute and then results in a TimeoutError.
I created a similar script using the requests library and confirmed that there are times when the requests version works while the aiohttp+asyncio version does not.
Here are the two scripts:
Using aiohttp + asyncio
from aiohttp import ClientSession
import asyncio
home_url = 'https://www.tripadvisor.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/93.0.4577.63 Safari/537.36'
}

async def main():
    async with ClientSession(headers=headers) as session:
        tourist_sites_url = home_url + '/Attractions-g294245-Activities-a_allAttractions.true-Philippines.html'
        async with session.get(tourist_sites_url) as response:
            print(f'{response.status=}\n')
            print(await response.text())

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Using requests
from requests import Session
home_url = 'https://www.tripadvisor.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/93.0.4577.63 Safari/537.36'
}

def main():
    with Session() as session:
        tourist_sites_url = home_url + '/Attractions-g294245-Activities-a_allAttractions.true-Philippines.html'
        response = session.get(tourist_sites_url, headers=headers)
        print(f'{response.status_code=}\n')
        print(response.text)

if __name__ == '__main__':
    main()
What should I do to make the aiohttp+asyncio code work on the Tripadvisor website?
Thank you very much!
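A hedged variant of the aiohttp snippet that at least fails fast instead of hanging for a minute (it assumes an explicit ClientTimeout and a fuller set of browser-like headers; whether Tripadvisor then responds consistently is not verified):
from aiohttp import ClientSession, ClientTimeout
import asyncio

# assumption: an explicit timeout plus extra browser-like headers may help;
# not verified against Tripadvisor's actual blocking behaviour
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/93.0.4577.63 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
}

async def fetch(url):
    timeout = ClientTimeout(total=30)  # fail after 30 s instead of hanging for the default timeout
    async with ClientSession(headers=headers, timeout=timeout) as session:
        async with session.get(url) as response:
            print(f'{response.status=}')
            return await response.text()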

Related

Can not download excel file using requests in Python - I can't get past the third step of posting the request that downloads the excel file. Here is my try.

Here is my attempt to download the excel file. How do I make it work? Can someone please help me fix the last call?
import requests
from bs4 import BeautifulSoup
url = "http://lijekovi.almbih.gov.ba:8090/SpisakLijekova.aspx"
useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.76"
headers = {
    "User-Agent": useragent
}
session = requests.session() #session
r = session.get(url,headers=headers) #request to get cookies
soup = BeautifulSoup(r.text,"html.parser") #parsing values
viewstate = soup.find('input', {'id': '__VIEWSTATE'}).get('value')
viewstategenerator =soup.find('input', {'id': '__VIEWSTATEGENERATOR'}).get('value')
eventvalidation =soup.find('input', {'id': '__EVENTVALIDATION'}).get('value')
cookies = session.cookies.get_dict()
cookie=""
for k, v in cookies.items():
    cookie += k + "=" + v + ";"
cookie = cookie[:-1]
#header copied from the requests.
headers = {
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.76',
    'X-KL-Ajax-Request': 'Ajax_Request',
    'X-MicrosoftAjax': 'Delta=true',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': cookie
}
#post request data submission
data = {
    'ctl00$smMain': 'ctl00$MainContent$ReportGrid$ctl103$ReportGrid_top_4',
    '__EVENTTARGET': 'ctl00$MainContent$ReportGrid$ctl103$ReportGrid_top_4',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    '__EVENTVALIDATION': eventvalidation,
    '__ASYNCPOST': 'true'
}
#need help with this part
result = requests.get(url,headers=headers,data=data)
print(result.headers)
data = {
    "__EVENTTARGET": 'ctl00$MainContent$btnExport',
    '__VIEWSTATE': viewstate,
}
#remove ajax request for the last call to download excel file
del headers['X-KL-Ajax-Request']
del headers['X-MicrosoftAjax']
del headers['X-Requested-With']
result = requests.post(url,headers=headers,data=data,allow_redirects=True)
print(result.headers)
print(result.status_code)
#print(result.text)
with open("test.xlsx","wb") as f:
f.write(result.content)
I am trying to export the excel file without Selenium, but I am not able to get the last step to work. I need help converting this XMLHttpRequest flow to pure requests in Python, without any Selenium.
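For the last calls, a minimal sketch of one direction that might work, reusing the variables already defined in the code above (session, url, headers, data and the parsed hidden fields). It assumes the export postback only needs the standard ASP.NET hidden fields; the exact payload the server expects is not verified here:
# sketch only: send the postbacks through the existing `session` (not a bare
# requests.get) so the ASP.NET cookies travel automatically
result = session.post(url, headers=headers, data=data)   # the grid postback

# drop the AJAX-only headers for the export call
export_headers = {k: v for k, v in headers.items()
                  if k not in ('X-KL-Ajax-Request', 'X-MicrosoftAjax', 'X-Requested-With')}
export_data = {
    '__EVENTTARGET': 'ctl00$MainContent$btnExport',
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    '__EVENTVALIDATION': eventvalidation,
}
result = session.post(url, headers=export_headers, data=export_data, allow_redirects=True)
result.raise_for_status()   # fail loudly if the server rejects the postback
with open("test.xlsx", "wb") as f:
    f.write(result.content)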

Scraping a Video from a website and downloading it as mp4

I am trying to scrape this website and download the soccer goal that is in it. I saw this Stack Overflow post and tried its solution, but it still does not work.
Here is the code I have:
import requests
from bs4 import BeautifulSoup
# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4p'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
with open('video.mp4', 'wb') as f_out:
    r = requests.get(url, headers=headers, stream=True)
    print(r)
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)
When I printed the request it showed 403, and because of that the video will not open or download. Any thoughts on how I can download the video?
Use the URL below.
import requests
from bs4 import BeautifulSoup
# specify the URL of the archive here
url = 'https://cdn-cf-east.streamable.com/video/mp4/g6f986.mp4?Expires=1621994280&Signature=IqySuJxyVi9pCmC~JUhl-iyp-LmG6OiAFfQeu-~-a55osCfu9VrksEhzaQzJlMxAHcSt1R4j9Pt-G8sblQeFt3UtGqY-neHJkC4mUxuHjxGWAWdksyiAxkMb8DYRLkvIseUfkbKbeO6Dt807QwMkspFmXYdzljm8DLho6nMQfC--jtfy8B2gONhA9YUmK2o~fUHwTHzTXXqNGct2hQl-B9cFLDBdj8LXWTj-75YInwWxLwtoenKK~qLahGtJXKXvxTVltxMvUYXXvP9F~WfhNIhNqns1JKrrrqJ~N1XunZHCv~IVJyzOEvrn2G4J5LMIn~dcEZ9frV3APHsE4D~HQA__&Key-Pair-Id=APKAIEYUVEN4EVB2OKEQ'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
with open('video.mp4', 'wb') as f_out:
    r = requests.get(url, headers=headers, stream=True)
    print(r)
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)
This is how you extract the video link from Streamable programmatically, if you want to go that way:
import requests
from bs4 import BeautifulSoup
# specify the URL of the archive here
url = "https://streamable.com/a50s3e"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
# Find the <video> tag and pull its src attribute
video = soup.find_all('video')
temp = video[0].get("src")
url = "https:" + temp
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
with open('video.mp4', 'wb') as f_out:
    r = requests.get(url, headers=headers, stream=True)
    print(r)
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)
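Either way, it may be worth guarding the download (standard requests API, nothing site-specific) so a 403 stops the script instead of silently writing a broken video.mp4:
r = requests.get(url, headers=headers, stream=True)
r.raise_for_status()   # raises immediately on 403 instead of writing an empty/broken file
with open('video.mp4', 'wb') as f_out:
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            f_out.write(chunk)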

BeautifulSoup Python web scraping Missing html Main Body

I am using BeautifulSoup to scrape this web page: https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03
Result: I get the JavaScript and CSS.
Desired output: I need the main HTML.
I used this code:
import requests
from bs4 import BeautifulSoup
url = 'https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
I'm afraid you won't be able to get it directly using BeautifulSoup, because the page loads first and then JavaScript loads the data.
That's one of the library's limitations; you may need to use Selenium.
Please check the answers to this question.
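A minimal Selenium sketch along those lines (it assumes a local Chrome/chromedriver setup; the wait strategy and what you then parse out of the rendered HTML are up to you):
import time
from selenium import webdriver
from bs4 import BeautifulSoup

url = ('https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914'
       '&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03')

driver = webdriver.Chrome()   # assumes chromedriver is available on PATH
driver.get(url)
time.sleep(5)                 # crude wait for the JavaScript to render; a WebDriverWait on a known element would be better

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(soup.prettify())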
I think what you're looking for is this:
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
It will contain the text from the page, including the HTML tags.

Python pproxy: make a SOCKS5 HTTP request after creating the proxy over SSH

I am using the python-proxy package to convert my SSH connection into a SOCKS5 proxy, but I am facing issues making a SOCKS5 HTTP request after creating the server.
Python package used: https://github.com/qwj/python-proxy; reference example: https://github.com/qwj/python-proxy/blob/master/tests/api_server.py
import asyncio
import pproxy
loop = asyncio.get_event_loop()
async def make_request():
    import requests
    import socks
    import socket
    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "localhost", 8081)
    socket.socket = socks.socksocket
    proxies = {'http': "socks5://127.0.0.1:8081"}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    }
    url = u'https://api.ipgeolocationapi.com/geolocate/5.152.122.170'
    print(requests.get(url, verify=False, headers=headers).text)
    return "test"

async def ssh_handle():
    print("1")
    server = pproxy.Server('socks5://127.0.0.1:8081')
    remote = pproxy.Connection('ssh://185.110.12.11/#root:test')
    args = dict(rserver=[remote],
                verbose=print)
    await server.start_server(args)
    print("server started now")
    await asyncio.sleep(1)
    await make_request()  # after creating the server, call the function that makes the HTTP request through the proxy
    return "done"

try:
    loop.run_until_complete(ssh_handle())
    loop.run_forever()
except Exception as e:
    print(e)
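A hedged sketch of one possible direction, assuming the blocking requests call is what stalls the event loop while the pproxy server is running: run it in an executor and point requests at the local SOCKS5 port through a proxies dict (this needs requests[socks]/PySocks installed and is not verified against this exact setup):
import asyncio
import requests

PROXIES = {
    'http': 'socks5h://127.0.0.1:8081',   # socks5h so DNS resolution also goes through the proxy
    'https': 'socks5h://127.0.0.1:8081',
}

def blocking_request(url):
    # plain requests call, executed in a worker thread so the event loop stays free
    return requests.get(url, proxies=PROXIES, timeout=30).text

async def make_request():
    loop = asyncio.get_running_loop()
    url = 'https://api.ipgeolocationapi.com/geolocate/5.152.122.170'
    text = await loop.run_in_executor(None, blocking_request, url)
    print(text)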

Instagram Scraping with endpoints requires authentication for all requests

As you know, Instagram announced that they have changed their endpoint APIs this month.
It looks like, in the wake of Cambridge Analytica, Instagram has changed its endpoint formats and now requires a logged-in user session for all requests.
I'm not sure which endpoints need updating, but I was specifically using the media/comments endpoints, which are now as follows:
Media OLD:
https://www.instagram.com/graphql/query/?query_id=17888483320059182&id={0}&first=100&after={1}
Media NEW:
https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%7B%22id%22%3A%2221575514%22%2C%22first%22%3A12%2C%22after%22%3A%22AQAHXuz1DPmI3FFLOzy5iKEhHOLKw3lt_ozVR40TphSdns0Vp5j_ZEU6Qj0CW6IqNtVGO5pmLCQoX0Y8RVS9aRTT2lWPp6vf8vFqjo1QfxRYmA%22%7D
The script that I used to work around this problem is as follows:
#!/usr/bin/env python3
import requests
import urllib.parse
import hashlib
import json
#CHROME_UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
CHROME_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
def getSession_old(rhx_gis, csrf_token, variables):
    """ Get session preconfigured with required headers & cookies. """
    # signature input: "rhx_gis:csrf_token:user_agent:variables"
    print(variables)
    values = "%s:%s:%s:%s" % (
        rhx_gis,
        csrf_token,
        CHROME_UA,
        variables)
    x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
    session = requests.Session()
    session.headers = {
        'user-agent': CHROME_UA,
        'x-instagram-gis': x_instagram_gis
    }
    print(x_instagram_gis)
    session.cookies.set('ig_pr', '2')
    session.cookies.set('csrftoken', csrf_token)
    return session

def getSession(rhx_gis, variables):
    """ Get session preconfigured with required headers & cookies. """
    # signature input: "rhx_gis:variables"
    values = "%s:%s" % (
        rhx_gis,
        variables)
    x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
    session = requests.Session()
    session.headers = {
        'x-instagram-gis': x_instagram_gis
    }
    return session

if __name__ == '__main__':
    session = requests.Session()
    session.headers = {'user-agent': CHROME_UA}
    response = session.get("https://www.instagram.com/selenagomez")
    data = json.loads(response.text.split("window._sharedData = ")[1].split(";</script>")[0])
    csrf = data['config']['csrf_token']
    rhx_gis = data['rhx_gis']
    variables = '{"id":"460563723","first":10,"after":"AQBf8puhlt8nU2JzmYdMMTuH0FbMgUM1fnIOZIH7n94DM4VLWkVILUAKVB-5dqvxQEI-Wd0ttlEDzimaaqwC98jccQaDQT4tSF56c_NlWi_shg"}'
    session = getSession(rhx_gis, variables)
    query_hash = '42323d64886122307be10013ad2dcc44'
    encoded_vars = urllib.parse.quote(variables, safe='"')
    url = 'https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s' % (query_hash, encoded_vars)
    print(url)
    print(session.get(url).text)
I am sure this script was working well until about 11 days ago, but it is not working now.
Does anyone know how to get user posts without authenticating?
