Why does using a cookie to log in to a website fail in Python 3? - python-3.x

I want to use a cookie I have obtained to access a website and fetch information that can only be seen after a user logs in, but when I try it, the result shows the user isn't logged in to the website. This is my code; can someone tell me how to fix the problem?
import re
import urllib.error
import urllib.parse
import urllib.request
import http.cookiejar
from bs4 import BeautifulSoup

LOGIN_URL = "https://www.yaozh.com/login/"
values = {'username': 'username', 'pwd': 'password'}  # , 'submit' : 'Login'
postdata = urllib.parse.urlencode(values).encode()
user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36')
headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

cookie_filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
    response = opener.open(request)
    page = response.read().decode()
    # print(page)
except urllib.error.URLError as e:
    print(e.code, ':', e.reason)

cookie.save(ignore_discard=True, ignore_expires=True)
print(cookie)
for item in cookie:
    print('Name = ' + item.name)
    print('Value = ' + item.value)

get_url = 'https://db.yaozh.com/instruct?p=1&pageSize=20'
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
content = get_response.read()
print(content)

# parse the bytes we already read (reading the response object twice would return nothing)
bs = BeautifulSoup(content, "html.parser")
urls = bs.find_all(name='a', attrs={"href": re.compile(r"\.doc")}, recursive=True)
print(len(urls))
for url in urls:
    print(url["href"])

The problem has been solved. If you face the same problem, check whether the data you POST to the server is complete: many login forms contain hidden fields that real users never see, and the server uses them to decide whether the request comes from a genuine user. Good luck!
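As a minimal sketch of that idea (the hidden-input names are whatever the login page happens to contain, not fields verified against yaozh.com), you can scrape every hidden input from the login form and echo it back in the POST data:

import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

LOGIN_URL = "https://www.yaozh.com/login/"

# fetch the login page and collect all hidden form fields
login_page = urllib.request.urlopen(LOGIN_URL).read()
soup = BeautifulSoup(login_page, "html.parser")
values = {'username': 'username', 'pwd': 'password'}
for hidden in soup.find_all('input', type='hidden'):
    # hidden fields such as CSRF tokens must be sent back unchanged
    if hidden.get('name'):
        values[hidden['name']] = hidden.get('value', '')

postdata = urllib.parse.urlencode(values).encode()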

Related

How to properly do a Facebook mobile site login

I'm trying to develop some code in order to make successful Facebook logins. To simplify as much as possible, I use the mbasic.facebook.com address.
My code is the following (using requests on the latest Python version):
import requests
from bs4 import BeautifulSoup

# credentials is assumed to be defined elsewhere, e.g.
# credentials = {"email": "...", "pass": "..."}

if __name__ == '__main__':
    base_url = 'https://mbasic.facebook.com'
    with requests.session() as session:
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/76.0.3809.87 Safari/537.36"
        )
        cookie = 'locale=it_IT;'
        default_headers = {
            'User-Agent': user_agent,
            'Accept-Language': 'it-IT,en;q=0.5',
            'cookie': cookie,
        }
        session.headers.update(default_headers)
        login_form_url = ('/login/device-based/regular/login/'
                          '?refsrc=https%3A%2F%2Fmbasic.facebook.com%2F&lwv=100&ref=dbl')
        r = session.get("https://mbasic.facebook.com/login/")
        page1 = BeautifulSoup(r.text, "lxml")
        form = page1.find('form')
        # collect the hidden fields the login form expects
        lsd = page1.find('input', {'name': 'lsd'})['value']
        jazoest = page1.find('input', {'name': 'jazoest'})['value']
        mts = page1.find('input', {'name': 'm_ts'})['value']
        li = page1.find('input', {'name': 'li'})['value']
        try_number = page1.find('input', {'name': 'try_number'})['value']
        unrecognized_tries = page1.find('input', {'name': 'unrecognized_tries'})['value']
        data = {'lsd': lsd, 'jazoest': jazoest, 'm_ts': mts, 'li': li,
                'try_number': try_number, 'unrecognized_tries': unrecognized_tries,
                'email': credentials["email"], 'pass': credentials["pass"],
                'login': 'Accedi'}
        r = session.post(base_url + login_form_url, data=data, verify=False)
        # now, I need to complete the second part of the login
        with open("first_login.html", "w", encoding="utf-8") as h:
            h.write(r.text)
        c = BeautifulSoup(r.text, "lxml")
        form = c.find('a')
        action = form.get('href')
        r = session.get(base_url + action, data=data, verify=False)
        with open("second_login.html", "w", encoding="utf-8") as f:
            f.write(r.text)
Now, with this code I successfully get my home feed as a logged-in user. However, the problem begins when I try to move, for instance, to a specific Facebook public page, because it returns the page as if I weren't logged in. The same weird thing happens when I try to get a specific post: it doesn't show me any comments, like it does in my browser.
I tried to play with session cookies, but to no avail.
The solution was to change the user agent to:
Mozilla/5.0 (BB10; Kbd) AppleWebKit/537.35+ (KHTML, like Gecko) Version/10.3.3.3057 Mobile Safari/537.35+
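For reference, a minimal sketch of applying that user agent across a requests session (the target URL is just a placeholder):

import requests

BB10_UA = ('Mozilla/5.0 (BB10; Kbd) AppleWebKit/537.35+ '
           '(KHTML, like Gecko) Version/10.3.3.3057 Mobile Safari/537.35+')

session = requests.Session()
# every request made through this session now presents the BB10 user agent
session.headers.update({'User-Agent': BB10_UA})
r = session.get('https://mbasic.facebook.com/somepublicpage')  # placeholder page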

Instagram Scraping with endpoints requires authentication for all requests

As you may know, Instagram announced they have changed their endpoint APIs this month.
It looks like, in the wake of Cambridge Analytica, Instagram has changed its endpoint formats and now requires a logged-in user session for all requests.
I'm not sure which endpoints need updating, but I was specifically using the media/comments endpoints, which are now as follows:
Media OLD:
https://www.instagram.com/graphql/query/?query_id=17888483320059182&id={0}&first=100&after={1}
Media NEW:
https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%7B%22id%22%3A%2221575514%22%2C%22first%22%3A12%2C%22after%22%3A%22AQAHXuz1DPmI3FFLOzy5iKEhHOLKw3lt_ozVR40TphSdns0Vp5j_ZEU6Qj0CW6IqNtVGO5pmLCQoX0Y8RVS9aRTT2lWPp6vf8vFqjo1QfxRYmA%22%7D
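In other words, the new format is a query_hash plus a URL-encoded JSON variables blob. A minimal sketch of building such a URL (the id and after cursor below are placeholders, not real values):

import json
import urllib.parse

query_hash = '42323d64886122307be10013ad2dcc44'
variables = {"id": "21575514", "first": 12, "after": "<end_cursor>"}  # placeholder cursor
encoded = urllib.parse.quote(json.dumps(variables, separators=(',', ':')))
url = ('https://www.instagram.com/graphql/query/'
       '?query_hash=%s&variables=%s' % (query_hash, encoded))
print(url)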
The script that I used to work around this problem is as follows:
#!/usr/bin/env python3
import requests
import urllib.parse
import hashlib
import json

#CHROME_UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
CHROME_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'


def getSession_old(rhx_gis, csrf_token, variables):
    """ Get session preconfigured with required headers & cookies. """
    # the signature is "rhx_gis:csrf_token:user_agent:variables"
    print(variables)
    values = "%s:%s:%s:%s" % (
        rhx_gis,
        csrf_token,
        CHROME_UA,
        variables)
    x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
    session = requests.Session()
    session.headers = {
        'user-agent': CHROME_UA,
        'x-instagram-gis': x_instagram_gis
    }
    print(x_instagram_gis)
    session.cookies.set('ig_pr', '2')
    session.cookies.set('csrftoken', csrf_token)
    return session


def getSession(rhx_gis, variables):
    """ Get session preconfigured with required headers & cookies. """
    # the signature is now just "rhx_gis:variables"
    values = "%s:%s" % (
        rhx_gis,
        variables)
    x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
    session = requests.Session()
    session.headers = {
        'x-instagram-gis': x_instagram_gis
    }
    return session


if __name__ == '__main__':
    session = requests.Session()
    session.headers = {'user-agent': CHROME_UA}
    response = session.get("https://www.instagram.com/selenagomez")
    data = json.loads(response.text.split("window._sharedData = ")[1].split(";</script>")[0])
    csrf = data['config']['csrf_token']
    rhx_gis = data['rhx_gis']
    variables = '{"id":"460563723","first":10,"after":"AQBf8puhlt8nU2JzmYdMMTuH0FbMgUM1fnIOZIH7n94DM4VLWkVILUAKVB-5dqvxQEI-Wd0ttlEDzimaaqwC98jccQaDQT4tSF56c_NlWi_shg"}'
    session = getSession(rhx_gis, variables)
    query_hash = '42323d64886122307be10013ad2dcc44'
    encoded_vars = urllib.parse.quote(variables, safe='"')
    url = 'https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s' % (query_hash, encoded_vars)
    print(url)
    print(session.get(url).text)
I am sure this script was working well until 11 days ago, but it is not working now.
Does anyone know how to get user posts without authenticating?

Python 3.6.4, Scraping a website that requires login

Login address: https://joffice.jeunesseglobal.com/login.asp
Two fields need to be posted: Username and pw.
The cookie is then used to access https://joffice.jeunesseglobal.com/members/back_office.asp
Can't log in.
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.request
import http.cookiejar

url = 'https://joffice.jeunesseglobal.com/members/back_office.asp'
login_url = "https://joffice.jeunesseglobal.com/login.asp"
login_username = "jianghong181818"
login_password = "Js#168168!"
login_data = {
    "Username": login_username,
    "pw": login_password,
}
post_data = urllib.parse.urlencode(login_data).encode('utf-8')
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
req = urllib.request.Request(login_url, headers=headers, data=post_data)
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
resp = opener.open(req)
print(resp.read().decode('utf-8'))
Use requests.
Simple way:
>>> import requests
>>> page = requests.get("https://joffice.jeunesseglobal.com/login.asp",
...                     auth=('username', 'password'))
Making requests with HTTP Basic Auth:
>>> from requests.auth import HTTPBasicAuth
>>> requests.get("https://joffice.jeunesseglobal.com/login.asp", auth=HTTPBasicAuth('user', 'pass'))
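If the site uses an ordinary HTML login form rather than HTTP Basic Auth, a session-based POST is more likely what's needed. A sketch using the field names from the question (untested against this site):

import requests

login_url = "https://joffice.jeunesseglobal.com/login.asp"
back_office = "https://joffice.jeunesseglobal.com/members/back_office.asp"

with requests.Session() as s:
    # the session stores whatever cookies the login response sets
    s.post(login_url, data={"Username": "username", "pw": "password"})
    r = s.get(back_office)
    print(r.status_code)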

Logging into Twitter using python3 and requests

I have a project that I am working on whose requirements are to log in to a website using a username and password. I have to do it in Python, and then be able to access a part of the site only accessible to people who are logged in. I have tried a few variations of the code, and haven't been able to log in successfully yet. Here is my code:
The function to log in:
import requests
from lxml import html


def session2(url):
    #r = requests.get(url)
    #ckies = []
    #print("here are the cookies for twitter:\n")
    #for cky in r.cookies:
    #    print(cky.name, cky.value)
    #    ckies.append(cky)
    s = requests.Session()
    session = s.get(url, verify=False)
    print("\nheaders from site\n")
    print(session.headers)
    tree = html.fromstring(session.text)
    # extract the auth token needed to login along with username and password
    auth_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]
    uname = "username"
    pword = "password"
    username = 'session[username_or_email]'
    password = 'session[password]'
    # payload = {name of username variable: string you want, name of password variable:
    #            string you want, name of auth token: string gotten from session}
    payload = dict(username=uname, password=pword, authenticity_token=auth_token)
    header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'}
    # do post request
    # might have to change headers to be a header for chrome
    response = s.post(
        url,
        data=payload,
        #headers = dict(referer = url)
        headers=header
    )
    print("\nheaders post\n")
    print(response.request.headers)
    session = s.get("http://www.twitter.com/username/followers", verify=False)
    print("\nheaders get\n")
    print(session.headers)
    print("\nhtml doc\n")
    print(session.text)
    return session
The code to call it:
url = "http://www.twitter.com/login"
sessions = session2(url)
The username field on the site looks like this when you inspect it:
<input class="js-username-field email-input js-initial-focus" type="text" name="session[username_or_email]" autocomplete="on" value="" placeholder="Phone, email or username">
and the password and token sections look like this:
<input class="js-password-field" type="password" name="session[password]" placeholder="Password">
<input type="hidden" value="ef25cb09a8c7fe16c54e3df099e206e605b1170a" name="authenticity_token">
I know the auth token changes, which is why I have the function fetch it. When I try to run this, it just goes to the main page rather than the page I need.
One problem, I think, is that when I print out the header that I send in the post, it says:
{'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', 'Accept': '*/*', 'User-Agent': 'python-requests/2.9.1'}
which I thought I had changed to Chrome's header, but it doesn't seem to stick.
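A minimal sketch of setting the header on the session itself, so it applies to every request made through it (the URL is illustrative):

import requests

s = requests.Session()
# headers set here are merged into every request made through this session
s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) '
                                'Chrome/53.0.2785.116 Safari/537.36'})
r = s.get('https://twitter.com/login')
print(r.request.headers['User-Agent'])  # reports the Chrome UA, not python-requests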
Also, I know there is a way if I use OAuth, but I'm not allowed to use that; I have to do it by logging in as if I were using a browser.
Can you tell me if there is anything wrong with what I've done, as well as any hints on how to fix it? I've tried other Stack Overflow answers about requests and logging in, but those didn't work either.
EDIT: OK, I printed response.request.headers, and it came out with the right header, I think, so I don't think that is the problem.
The header it prints:
{'Accept': '*/*', 'Content-Type': 'application/x-www-form-urlencoded', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36', 'Cookie': '_twitter_sess=some huge amount of number/letters; guest_id=v1%3A147509653977967101', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}
This will log you in:
import requests
from bs4 import BeautifulSoup

username = "uname"
password = "pass"
# login url
post = "https://twitter.com/sessions"
url = "https://twitter.com"
data = {"session[username_or_email]": username,
        "session[password]": password,
        "scribe_log": "",
        "redirect_after_login": "/",
        "remember_me": "1"}

with requests.Session() as s:
    r = s.get(url)
    # get auth token
    soup = BeautifulSoup(r.content, "lxml")
    AUTH_TOKEN = soup.select_one("input[name=authenticity_token]")["value"]
    # update data, post and you are logged in.
    data["authenticity_token"] = AUTH_TOKEN
    r = s.post(post, data=data)
    print(r.content)
You can see if we run it using my own account, we get my name from my profile:
In [30]: post = "https://twitter.com/sessions"

In [31]: url = "https://twitter.com"

In [32]: data = {"session[username_or_email]": username,
   ....:         "session[password]": password,
   ....:         "scribe_log": "",
   ....:         "redirect_after_login": "/",
   ....:         "remember_me": "1"}

In [33]: with requests.Session() as s:
   ....:     r = s.get(url)
   ....:     soup = BeautifulSoup(r.content, "lxml")
   ....:     AUTH_TOKEN = soup.select_one("input[name=authenticity_token]")["value"]
   ....:     data["authenticity_token"] = AUTH_TOKEN
   ....:     r = s.post(post, data=data)
   ....:     soup = BeautifulSoup(r.content, "lxml")
   ....:     print(soup.select_one("b.fullname"))
   ....:
<b class="fullname">Padraic Cunningham</b>
Just be aware that each time you log in, you will get the "We noticed a recent login for your account ..." email.

Flurry scraping using python3 requests.Session()

This seems really straightforward, but for some reason it isn't connecting to Flurry correctly and I'm unable to scrape the data.
import requests

loginurl = "https://dev.flurry.com/secure/loginPage.do"
csvurl = "https://dev.flurry.com/eventdata"

session = requests.Session()
login = session.post(loginurl, data={'loginEmail': 'user', 'loginPassword': 'pass'})
data = session.get(csvurl)
Every time I try to use this, I get redirected back to the login screen (loginurl) without fetching the new data. Has anyone been able to connect to Flurry like this successfully before?
Any and all help would be greatly appreciated. Thanks.
There are two more form fields to be populated: struts.token.name, and the token value itself under the field name that struts.token.name holds. You also have to post to loginAction.do.
You can do an initial GET and parse the values using bs4, then post the data:
from bs4 import BeautifulSoup
import requests

loginurl = "https://dev.flurry.com/secure/loginAction.do"
csvurl = "https://dev.flurry.com/eventdata"
data = {'loginEmail': 'user', 'loginPassword': 'pass'}

with requests.Session() as session:
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"})
    soup = BeautifulSoup(session.get(loginurl).content, "html.parser")
    # the struts token name and its value are dynamic hidden fields
    name = soup.select_one("input[name=struts.token.name]")["value"]
    data["struts.token.name"] = name
    data[name] = soup.select_one("input[name={}]".format(name))["value"]
    login = session.post(loginurl, data=data)
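Once the login POST succeeds, the same session should carry the authenticated cookies, so the CSV fetch from the question can follow inside the with block (a hedged follow-up, not verified against Flurry):

    # still inside the with block: reuse the logged-in session
    data_resp = session.get(csvurl)
    print(data_resp.status_code)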
