Flurry scraping using python3 requests.Session() - python-3.x

This seems really straightforward, but for some reason it isn't connecting to Flurry correctly and I am unable to scrape the data.
loginurl = "https://dev.flurry.com/secure/loginPage.do"
csvurl = "https://dev.flurry.com/eventdata"
session = requests.Session()
login = session.post(loginurl, data={'loginEmail': 'user', 'loginPassword': 'pass'})
data = session.get(csvurl)
Every time I try to use this, I get redirected back to the login screen (loginurl) without fetching the new data. Has anyone been able to connect to Flurry like this successfully before?
Any and all help would be greatly appreciated, thanks.

There are two more form fields to be populated: struts.token.name, and the token value that struts.token.name points to. You also have to post to loginAction.do:
You can do an initial get and parse the values using bs4 then post the data:
from bs4 import BeautifulSoup
import requests

loginurl = "https://dev.flurry.com/secure/loginAction.do"
csvurl = "https://dev.flurry.com/eventdata"

data = {'loginEmail': 'user', 'loginPassword': 'pass'}

with requests.Session() as session:
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"})
    # fetch the login page and pull the struts token fields out of the form
    soup = BeautifulSoup(session.get(loginurl).content, "html.parser")
    name = soup.select_one("input[name=struts.token.name]")["value"]
    data["struts.token.name"] = name
    data[name] = soup.select_one("input[name={}]".format(name))["value"]
    login = session.post(loginurl, data=data)
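If the token fields are accepted and the login succeeds, you should then be able to pull the export with the same session. A quick sketch, still inside the with block; whether it returns data depends on Flurry accepting the credentials:
    # reuse the authenticated session to fetch the data the question was after
    resp = session.get(csvurl)
    print(resp.status_code)
    print(resp.text[:200])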

Related

How to scrape an article which requires login to view full content using python?

I am trying to scrape an article from The Wall Street Journal, which requires a log-in to view the full content. So I have written code like the below using Python Requests:
import requests
from bs4 import BeautifulSoup
import re
import base64
import json
username= <username>
password= <password>
base_url= "https://accounts.wsj.com"
session = requests.Session()
r = session.get("{}/login".format(base_url))
soup = BeautifulSoup(r.text, "html.parser")
credentials_search = re.search(r"Base64\.decode\('(.*)'", r.text, re.IGNORECASE)
base64_decoded = base64.b64decode(credentials_search.group(1))
credentials = json.loads(base64_decoded)
connection = <connection_name>
r = session.post(
    'https://sso.accounts.dowjones.com/usernamepassword/login',
    data={
        "username": username,
        "password": password,
        "connection": connection,
        "client_id": credentials["clientID"],
        "state": credentials["internalOptions"]["state"],
        "nonce": credentials["internalOptions"]["nonce"],
        "scope": credentials["internalOptions"]["scope"],
        "tenant": "sso",
        "response_type": "code",
        "protocol": "oauth2",
        "redirect_uri": "https://accounts.wsj.com/auth/sso/login"
    })
soup = BeautifulSoup(r.text, "html.parser")
login_result = dict([
    (t.get("name"), t.get("value"))
    for t in soup.find_all('input')
    if t.get("name") is not None
])
r = session.post(
    'https://sso.accounts.dowjones.com/login/callback',
    data=login_result,
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"},
)
# article get request
r = session.get(
    "https://www.wsj.com/articles/singapore-prime-minister-lee-rejects-claims-he-misused-state-powers-in-family-feud-1499094761",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"}
)
print(r.text)
I am able to log in through the request, but I am still not getting the full article to scrape. Can anyone help me with this? Thanks in advance :-)
An easy and reliable solution is using the selenium webdriver.
With selenium you create an automated browser window which opens the website, and from there you can make it select the elements needed to log in. The content then loads in as usual, just as when you browse the site manually.
Then you can soup that page with BeautifulSoup.
from bs4 import BeautifulSoup
from selenium import webdriver

# Download the driver for your desired browser and place it in any path
driver = webdriver.Firefox(executable_path=r"C:\Program Files (x86)\geckodriver.exe")
# for Chrome it's: driver = webdriver.Chrome(r"C:\Program Files (x86)\chromedriver.exe")

# open your website link
driver.get("https://www.your-url.com")

# then soup the page with BS
html = driver.page_source
page_soup = BeautifulSoup(html, "html.parser")
From there you can use the "page_soup" as you usually would.
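If the page only shows the full article after logging in, you can also drive the login form before grabbing page_source. A rough sketch, where the element IDs ("username", "password", "login-button") are hypothetical and must be replaced with the ones on the real page:

from selenium.webdriver.common.by import By

# fill in the login form (hypothetical element IDs; inspect the real page to find them)
driver.find_element(By.ID, "username").send_keys("your-username")
driver.find_element(By.ID, "password").send_keys("your-password")
driver.find_element(By.ID, "login-button").click()

# once the logged-in page has loaded, soup it as above
page_soup = BeautifulSoup(driver.page_source, "html.parser")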
Any questions? :)

Python: Can't extract tbody information from website

I want to extract all links of this website: https://pflegefinder.bkk-dachverband.de/pflegeheime/searchresult.php?required=1&statistics=1&searchdata%5BmaxDistance%5D=0&searchdata%5BcareType%5D=inpatientCare#/tab/general
The information I want is stored in the tbody of the results table.
Every time I try to extract the data I get no result.
from bs4 import BeautifulSoup
import requests
from requests_html import HTMLSession
url = "https://pflegefinder.bkk-dachverband.de/pflegeheime/searchresult.php?required=1&statistics=1&searchdata%5BmaxDistance%5D=0&searchdata%5BcareType%5D=inpatientCare#complex-searchresult"
session = HTMLSession()
r = session.get(url)
r.html.render()
soup = BeautifulSoup(r.html.html,'html.parser')
print(r.html.search("Details"))
Thank you for your help!
The site uses a backend API to deliver the info. If you look at your browser's Developer Tools > Network > Fetch/XHR and refresh the page, you'll see the data load via JSON in a request with a URL similar to the one you posted.
You can scrape that data like this; it returns JSON, which is easy enough to parse:
import requests

headers = {
    'Referer': 'https://pflegefinder.bkk-dachverband.de/pflegeheime/searchresult.php?required=1&statistics=1&searchdata%5BmaxDistance%5D=0&searchdata%5BcareType%5D=inpatientCare',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}

for page in range(2):
    url = f'https://pflegefinder.bkk-dachverband.de/api/nursing-homes?required=1&statistics=1&maxDistance=0&careType=inpatientCare&limit=20&offset={page*20}'
    resp = requests.get(url, headers=headers).json()
    print(resp)
The API checks that you have a "Referer" header; otherwise you get a 400 response.
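You can confirm that requirement quickly by repeating the request without the Referer (a small check, assuming the behaviour described above still holds):

# same URL as above, but with only the User-Agent header
no_ref = requests.get(url, headers={'User-Agent': headers['User-Agent']})
print(no_ref.status_code)  # expected: 400 without the Referer, 200 with it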

Using scrapy to scrape food aggregators like Grubhub, needed for some personal data science purpose

I am trying to figure out a way to get the data out of the website using Splash, but have had no result. I need your help figuring out a way to do that!
Edit:
link: https://www.grubhub.com/search?location=10001 (link edited for brevity)
And similarly for different states based on ZIP code; the data I need is the name of each restaurant, its menu, and its ratings, for all available data.
I tried to reverse-engineer their API, but you might have to adjust it (and probably optimise it) to your needs:
We will have to get an authentication bearer in order to use their API. To get the token we will first need a client_id:
libraries I am using
import requests
from bs4 import BeautifulSoup
import re
import json
get client_id
session = requests.Session()
static = 'https://www.grubhub.com/eat/static-content-unauth?contentOnly=1'
soup = BeautifulSoup(session.get(static).text, 'html.parser')
client = re.findall("beta_[a-zA-Z0-9]+", soup.find('script', {'type': 'text/javascript'}).text)
# print(client)
get authentication bearer
# define and add a proper header
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    'authorization': 'Bearer',
    'content-type': 'application/json;charset=UTF-8'
}
session.headers.update(headers)
# straight from networking tools. Device ID appears to accept any 10-digit value
data = '{"brand":"GRUBHUB","client_id":"' + client[0] + '","device_id":1234567890,"scope":"anonymous"}'
resp = session.post('https://api-gtm.grubhub.com/auth', data=data)
# refresh = json.loads(resp.text)['session_handle']['refresh_token']
access = json.loads(resp.text)['session_handle']['access_token']
# update header with new token
session.headers.update({'authorization': 'Bearer ' + access})
make your request
Here comes the part that is different from your request: their API uses a third-party service to get the longitude and latitude for a ZIP code (e.g. -73.99916077, 40.75368499 for your New York ZIP code). There might even be an option to change that: the location=POINT(-73.99916077%2040.75368499) parameter looks like it accepts other formats as well.
grub = session.get('https://api-gtm.grubhub.com/restaurants/search/search_listing?orderMethod=delivery&locationMode=DELIVERY&facetSet=umamiV2&pageSize=20&hideHateos=true&searchMetrics=true&location=POINT(-73.99916077%2040.75368499)&facet=promos%3Atrue&facet=open_now%3Atrue&variationId=promosSponsoredRandom&sortSetId=umamiv3&countOmittingTimes=true')
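Since that query string is hard to read, you can also let requests build it from a params dict and format the POINT value from whatever coordinates you look up for a ZIP code. A sketch using the same parameters as the request above; how you obtain the coordinates is up to you:

lng, lat = -73.99916077, 40.75368499  # coordinates for ZIP 10001, from the example above

params = {
    'orderMethod': 'delivery',
    'locationMode': 'DELIVERY',
    'facetSet': 'umamiV2',
    'pageSize': 20,
    'hideHateos': 'true',
    'searchMetrics': 'true',
    'location': f'POINT({lng} {lat})',  # requests URL-encodes the space for you
    'facet': ['promos:true', 'open_now:true'],  # a list produces repeated facet= params
    'variationId': 'promosSponsoredRandom',
    'sortSetId': 'umamiv3',
    'countOmittingTimes': 'true',
}
grub = session.get('https://api-gtm.grubhub.com/restaurants/search/search_listing', params=params)
print(grub.status_code)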

I want to open the first link that appears when I do a search on Google

I want to get the first link from the HTML parser, but I'm not getting anything (I tried to print it).
Also, when I inspect the page in the browser, the links are under class='r'.
But when I print soup.prettify() and analyse it closely, I find there is no class='r'; instead there is class="BNeawe UPmit AP7Wnd".
Please help, thanks in advance!
import requests
import sys
import bs4
import webbrowser

def open_web(query):
    res = requests.get('https://google.com/search?q=' + query)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    link_elements = soup.select('.r a')
    link_to_open = min(1, len(link_elements))
    for i in range(link_to_open):
        webbrowser.open('https://google.com' + link_elements[i].get('href'))

open_web('youtube')
The problem is that Google serves different HTML when you don't specify a User-Agent in the headers. To add a User-Agent to your request, pass it via the headers= argument:
import requests
import bs4

def open_web(query):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'}
    res = requests.get('https://google.com/search?q=' + query, headers=headers)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    link_elements = soup.select('.r a')
    print(link_elements)

open_web('youtube')
Prints:
[<a href="https://www.youtube.com/?gl=EE&hl=et" onmousedown="return rwt(this,'','','','1','AOvVaw2lWnw7oOhIzXdoFGYhvwv_','','2ahUKEwjove3h7onkAhXmkYsKHbWPAUYQFjAAegQIBhAC','','',event)"><h3 class="LC20lb">
... and so on.
You received completely different HTML, with different elements and selectors, which is why the output is empty. The reason Google blocks your request is that the default requests user-agent is python-requests; Google recognises it and blocks it. Check what your user-agent is.
The user-agent identifies the browser, its version number, and its host operating system, representing a person (browser) in a Web context; it lets servers and network peers work out whether the client is a bot or not.
Without a realistic one, you can sometimes receive different HTML with different selectors.
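To see what requests sends by default (the python-requests value mentioned above), you can print it; a quick check:

import requests

# the User-Agent that requests uses when you don't set one yourself
print(requests.utils.default_user_agent())  # e.g. "python-requests/2.28.1"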
You can pass URL params as a dict(), which is more readable, and requests does everything for you automatically (the same goes for adding a user-agent to the headers):
params = {
    "q": "My query goes here"
}
requests.get("YOUR_URL", params=params)
If you want to get the very first link then use select_one() instead.
Code and full example in the online IDE:
from bs4 import BeautifulSoup
import requests

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "My query goes here"
}

html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')

link = soup.select_one('.yuRUbf a')['href']
print(link)
# https://dev.mysql.com/doc/refman/8.0/en/entering-queries.html
Alternatively, you can do the same thing by using Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you only need to extract the data you want from a JSON string, rather than figuring out how to extract the data, maintain the parser, or bypass blocks from Google.
Code to integrate:
import os
from serpapi import GoogleSearch

params = {
    "engine": "google",
    "q": "My query goes here",
    "hl": "en",
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)
results = search.get_dict()

# [0] means first index of search results
link = results['organic_results'][0]['link']
# https://dev.mysql.com/doc/refman/8.0/en/entering-queries.html
Disclaimer: I work for SerpApi.

Instagram Scraping with endpoints requires authentication for all requests

As you know, Instagram announced that they have changed their endpoint APIs this month.
It looks like, in the wake of Cambridge Analytica, Instagram has changed up their endpoint formats and now requires a logged-in user session for all requests.
I'm not sure which endpoints need updating, but I was specifically using the media/comments endpoints, which are now as follows:
Media OLD:
https://www.instagram.com/graphql/query/?query_id=17888483320059182&id={0}&first=100&after={1}
Media NEW:
https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%7B%22id%22%3A%2221575514%22%2C%22first%22%3A12%2C%22after%22%3A%22AQAHXuz1DPmI3FFLOzy5iKEhHOLKw3lt_ozVR40TphSdns0Vp5j_ZEU6Qj0CW6IqNtVGO5pmLCQoX0Y8RVS9aRTT2lWPp6vf8vFqjo1QfxRYmA%22%7D
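The variables blob in the new URL is just URL-encoded JSON, so you can inspect (or rebuild) it with the standard library; a quick illustration using the value from the URL above:

import json
import urllib.parse

encoded = '%7B%22id%22%3A%2221575514%22%2C%22first%22%3A12%2C%22after%22%3A%22AQAHXuz1DPmI3FFLOzy5iKEhHOLKw3lt_ozVR40TphSdns0Vp5j_ZEU6Qj0CW6IqNtVGO5pmLCQoX0Y8RVS9aRTT2lWPp6vf8vFqjo1QfxRYmA%22%7D'
variables = json.loads(urllib.parse.unquote(encoded))
print(variables)  # {'id': '21575514', 'first': 12, 'after': 'AQAHXuz...'}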
The script that I used to work around this problem is as follows:
#!/usr/bin/env python3

import requests
import urllib.parse
import hashlib
import json

#CHROME_UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
CHROME_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'


def getSession_old(rhx_gis, csrf_token, variables):
    """ Get session preconfigured with required headers & cookies. """
    # gis is md5 of "rhx_gis:csrf_token:user_agent:variables"
    print(variables)
    values = "%s:%s:%s:%s" % (
        rhx_gis,
        csrf_token,
        CHROME_UA,
        variables)
    x_instagram_gis = hashlib.md5(values.encode()).hexdigest()

    session = requests.Session()
    session.headers = {
        'user-agent': CHROME_UA,
        'x-instagram-gis': x_instagram_gis
    }
    print(x_instagram_gis)

    session.cookies.set('ig_pr', '2')
    session.cookies.set('csrftoken', csrf_token)
    return session


def getSession(rhx_gis, variables):
    """ Get session preconfigured with required headers & cookies. """
    # gis is md5 of "rhx_gis:variables"
    values = "%s:%s" % (
        rhx_gis,
        variables)
    x_instagram_gis = hashlib.md5(values.encode()).hexdigest()

    session = requests.Session()
    session.headers = {
        'x-instagram-gis': x_instagram_gis
    }
    return session


if __name__ == '__main__':
    session = requests.Session()
    session.headers = {'user-agent': CHROME_UA}

    response = session.get("https://www.instagram.com/selenagomez")
    data = json.loads(response.text.split("window._sharedData = ")[1].split(";</script>")[0])
    csrf = data['config']['csrf_token']
    rhx_gis = data['rhx_gis']

    variables = '{"id":"460563723","first":10,"after":"AQBf8puhlt8nU2JzmYdMMTuH0FbMgUM1fnIOZIH7n94DM4VLWkVILUAKVB-5dqvxQEI-Wd0ttlEDzimaaqwC98jccQaDQT4tSF56c_NlWi_shg"}'
    session = getSession(rhx_gis, variables)

    query_hash = '42323d64886122307be10013ad2dcc44'
    encoded_vars = urllib.parse.quote(variables, safe='"')
    url = 'https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s' % (query_hash, encoded_vars)
    print(url)
    print(session.get(url).text)
I am sure this script was working well until 11 days ago, but it is not working now.
Does anyone know a solution for getting user posts without authenticating?
