Scraping from site that requires login, how to access the contents? - python-3.x

So I am trying to scrape a website that requires a login. I have used requests and submitted my login details, although when I try to extract the data from the website, I am not getting the website I am looking for.
USERNAME = "test@gmail.com"
PASSWORD = "test"
#MIDDLEWARE_TOKEN = "TESTTOKEN"
LOGIN_URL = "https://vrdistribution.com.au/auth/login/process"
VR_URL = "https://vrdistribution.com.au/categories/tabletop-gaming?page=1"


def main():
    """Log in through a requests session, then fetch VR_URL and print its HTML."""
    session_requests = requests.session()
    # Get login csrf token
    result = session_requests.get(LOGIN_URL)
    tree = html.fromstring(result.text)
    # NOTE(review): this is a list of every distinct `_token` value on the
    # page, not a single string.
    authenticity_token = list(set(tree.xpath("//input[@name='_token']/@value")))
    # Create payload
    # NOTE(review): the token is read from an input named `_token` but posted
    # under the key `csrfmiddlewaretoken` -- confirm which field name the
    # login form actually expects.
    payload = {
        "email": USERNAME,
        "password": PASSWORD,
        "csrfmiddlewaretoken": authenticity_token
    }
    # Perform login
    result = session_requests.post(LOGIN_URL, data=payload, headers=dict(referer=LOGIN_URL))
    # Scrape
    result = session_requests.get(VR_URL, headers=dict(referer=VR_URL))
    # NOTE(review): the page is requested a second time with the module-level
    # requests.get(), which does not share the session's cookies. The soup
    # printed below is built from this second response, not from `result`.
    response = requests.get(VR_URL)
    soup = BeautifulSoup(response.text, 'lxml')
    print(soup)
The output does not match the contents of VR_URL (https://vrdistribution.com.au/categories/tabletop-gaming?page=1) that I specified: when I inspect the page I want to scrape in the browser and compare it with the output of the soup object, the two are completely different.
Is there a way for me to access and scrape contents off the VR_URL?

Related

mg_data = requests.get(soup.select_one("capimg").attrs["src"].replace("data:image/png;base64,", "")).content

I am getting an error on this line: img_data = requests.get(soup.select_one("capimg").attrs["src"].replace("data:image/png;base64,", "")).content
# URL of the website
url = "https://enquiry.icegate.gov.in/enquiryatices/dgftTrackIEC"
for iec_number in iec_numbers:
    # request the website and extract the captcha
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    # NOTE(review): "capimg" is a tag-name selector; if the image is identified
    # by id or class the selector needs a "#" or "." prefix -- confirm against
    # the page markup. select_one() returns None when nothing matches.
    # NOTE(review): after the "data:image/png;base64," prefix is stripped, the
    # remaining base64 text is passed to requests.get() as if it were a URL.
    img_data = requests.get(soup.select_one("capimg").attrs["src"].replace("data:image/png;base64,", "")).content
    captcha = extract_captcha(img_data)
    # data to be sent in the request
    data = {
        "iecNO": iec_number,
        "sbStartDate": start_date,
        "sbEndDate": end_date,
        "captchaResp": captcha,
        "Submit": "Submit"
    }
    # post the request to the website
    # NOTE(review): the GET above and this POST are independent requests with
    # no shared session -- presumably the captcha is tied to a session cookie;
    # verify.
    resp = requests.post(url, data=data)
    soup = BeautifulSoup(resp.text, "html.parser")
I am trying to scrape the data from the website mentioned above.

python using requests and a webpage with a login issue

I'm trying to login to a website via python to print the info. So I don't have to keep logging into multiple accounts.
In the tutorial I followed, he just had a login and password, but this one has
Website Form Data
Does the _wp attributes change each login?
The code I use:
mffloginurl = 'https://myforexfunds.com/login-signup/'
mffsecureurl = 'https://myforexfunds.com/account-2'
payload = {
    'log': '*****@gmail.com',
    # NOTE(review): 'pdw' may be a typo for 'pwd' -- confirm against the
    # field names in the site's login form.
    'pdw': '*****',
    # Extra WordPress form fields, disabled here. They were previously wrapped
    # in a triple-quoted string, which Python concatenated onto the password
    # value above instead of ignoring.
    # 'brandnestor_action': 'login',
    # '_wpnonce': '9d1753c0b6',
    # '_wp_http_referer': '/login-signup/',
}
r = requests.post(mffloginurl, data=payload)
print(r.text)
using the correct details of course, but it doesn't login.
I tried without the extra wordpress elements and also with them but it still just goes to the signin page.
python output
different site addresses, different login details
Yeah the nonce will change with every new visit to the page.
I would use requests.session() so that it automatically stores session cookies and all that good stuff.
Do a session.get('some_login_page.com').
Parse the response content with BeautifulSoup to retrieve the nonce.
Then add that to the payload of your POST request when you log in.
A very quick and dirty example:
import requests
from bs4 import BeautifulSoup as bs

email = 'test@email.com'
password = 'password1234'
url = 'https://myforexfunds.com/account-2/'

# Start a session so cookies set by the login page are sent back with the POST
with requests.session() as session:
    # Send a GET request to the login page
    r = session.get(url)
    # Stop here if the page could not be fetched: there is no nonce to read
    if r.status_code != 200:
        raise SystemExit("Get Request Failed")
    # Parse the HTML content of the page
    soup = bs(r.content, 'lxml')
    # Extract the value of the nonce from the HTML
    nonce_field = soup.find(id='woocommerce-login-nonce')
    if nonce_field is None:
        raise SystemExit("Login nonce not found")
    nonce = nonce_field['value']
    # Set up the login form data
    params = {
        "username": email,
        "password": password,
        "woocommerce-login-nonce": nonce,
        "_wp_http_referer": "/account-2/",
        # Plain space: form encoding turns it into "+" on the wire
        "login": "Log in",
    }
    # Send the form fields in the POST body (data=), not the query string
    r = session.post(url, data=params)
    # NOTE(review): a 200 status only shows the request completed; the site may
    # return 200 for a rejected login too -- check the page content to confirm.
    if r.status_code != 200:
        print("Login Failed")

How to get access_token from fyers API?

I'm looking to get access_token from fyers API
I'm able to get the authorization_code and build the authorization_url, which I open in a browser so the user can enter their credentials. The access_token is displayed in the browser's address bar once the user logs in, but my program is unable to retrieve it.
Your help is much appreciated.
My code is as follows:
from fyers_api import accessToken
from fyers_api import fyersModel
import requests
import webbrowser
import urllib.request as ur

app_id = "XXXXXXXXX"
app_secret = "XXXXXXXXX"
app_session = accessToken.SessionModel(app_id, app_secret)
response = app_session.auth()
if response['code'] != 200:
    print('CODE=' + str(response['code']))
    print('MESSAGE=' + str(response['message']))
    print('Exiting program...')
    # Non-zero status so callers can tell the authorization step failed
    raise SystemExit(1)
authorization_code = response['data']['authorization_code']
app_session.set_token(authorization_code)
authorization_url = app_session.generate_token('XXXXXX')
# NOTE(review): webbrowser.open() only reports whether a browser could be
# launched (a bool); it does not return the URL the user is redirected to,
# so `token` below is not the access token.
token = webbrowser.open(authorization_url)
#Following authorization url is opened in browser:
#https://api.fyers.in/api/v1/genrateToken?authorization_code=xxxxxxxxxxxxx&appId=xxxxxxxxx&user_id=xxxxxx
#User is redirected to following url after successful log-in:
#https://trade.fyers.in/?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx=&user_id=xxxxxx
print(token)
#token="your_access_token"
#is_async = False #(By default False, Change to True for async API calls.)
#fyers = fyersModel.FyersModel(is_async)
#fyers.get_profile(token = token)
Instead of writing the mentioned code, it is better to directly call Fyers Api.
import requests

url = 'https://api.fyers.in/api/v1/token'
# Replace each placeholder with your own account details before running
requestParams = {
    "fyers_id":"Your Client ID",
    "password":"Your Password",
    "pan_dob":"Your PAN card or DOB(DD-MM-YYYY)",
    "appId":"YOur APP ID",
    "create_cookie":False}
# The dict is sent as a JSON body. Without a timeout the call can block
# indefinitely if the server never answers.
response = requests.post(url, json=requestParams, timeout=30)
print(response.text)
from fyers_api import accessToken
from fyers_api import fyersModel

app_id = "xxxxxxxxxx"
app_secret = "xxxxxxxxxx"
app_session = accessToken.SessionModel(app_id, app_secret)
response = app_session.auth()
print(app_session)
print(response)
authorization_code = response['data']['authorization_code']
app_session.set_token(authorization_code)
gen_token = app_session.generate_token()
print("token url is copy paste this url in browser and copy access "
      "token excluding your id at Last ")
print(gen_token)
print("tokent printed thanks")
# Paste the access token copied from the browser's address bar here; the value
# below is only a sample.
token = ("gAAAAABeTWk7AnufuuQQx0D0NkgABinWk7AnufuuQQx0DQ3ctAFWk7AnufuuQQx0DMQQwacJ-"
         "_xUVnrTu2Pk5K5QCLF0SZmw7nlpaWk7AnufuuQQx0DG4_3EGCYw92-iAh8=")
is_async = False
fyers = fyersModel.FyersModel(is_async)
print(fyers.get_profile(token=token))
# One funds request is enough; its result is printed directly
print(fyers.funds(token=token))

How to get access to a webpage with multi-factor authentication using python

I'm trying to access newclasses.nyu.edu to create a program that involves scraping the website but I need to first login with my student details and then enable MFA from my phone. How can I do this?
I have been able to get the user login details and store it in a dictionary but for some reason it shows invalid password
#Getting user_data
def login():
    """Prompt for a NetID and password and return them as login form fields.

    Returns:
        dict with the keys 'j_username', 'j_password' and '_eventId_proceed'.
    """
    login_data = {}
    NetID = input("Enter your NetID")
    password = input("Enter your password")
    login_data['j_username'] = NetID
    login_data['j_password'] = password
    login_data['_eventId_proceed'] = ''
    return login_data


with requests.Session() as s: #Maintain a session
    url = "https://newclasses.nyu.edu"
    request = s.get(url) #Getting access to the site
    #soup = BeautifulSoup(request.content, 'html5lib') #Scraping the site
    login_data = login()
    # NOTE(review): this prints the password in clear text
    print(login_data)
    # NOTE(review): the credentials are sent with a GET to the same start URL;
    # a login form is normally submitted with POST to the form's action URL --
    # confirm against the page's <form> element.
    request = s.get(url, data = login_data)
    print(request.content)
This code prints out the webpage showing an invalid password but what I want is to be able to login and activate the MFA and get access to the site

Trying to scrape a website that requires login

I'm new to this and have been at it for almost a week now, trying to scrape a website I use to collect analytics data (think of it like Google Analytics).
I tried playing around with XPath to figure out what this script is able to pull, but all I get is "[]" as the output after running it.
Please help me find what i'm missing.
import requests
from lxml import html

#credentials
payload = {
    'username': '<my username>',
    'password': '<my password>',
    'csrf-token': '<auth token>'
}
#open a session with login
session_requests = requests.session()
login_url = '<my website>'
result = session_requests.get(login_url)
#passing the auth token
tree = html.fromstring(result.text)
# NOTE(review): the token extracted here is never copied into `payload`, so
# the POST below still sends the placeholder under 'csrf-token'. The input is
# named 'form_token' while the payload key is 'csrf-token' -- confirm which
# name the form expects.
authenticity_token = list(set(tree.xpath('//input[@name=\'form_token\']/@value')))[0]
result = session_requests.post(
    login_url,
    data=payload,
    headers=dict(referer=login_url)
)
#scrape the analytics dashboard from this event link
url = '<my analytics webpage url>'
result = session_requests.get(
    url,
    headers=dict(referer=url)
)
#print output using xpath to find and load what i need
trees = html.fromstring(result.content)
# An empty list means no matching text node exists in the HTML that was returned
bucket_names = trees.xpath("//*[@id='statistics_dashboard']/div[1]/text()")
print(bucket_names)
print(result.ok)
print(result.status_code)
..........
this is what i get as a result
[]
True
200
Process finished with exit code 0
which is a big step for me because i've been getting so many errors to get to this point.

Resources